# Load libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(ROCR)
library(ROCit)
library(pander)
library(cluster)
library(lime)
library(gridExtra)
# Load datasets
# Per-country, per-year death counts attributed to 28 health risk factors
# (6840 rows x 31 cols: Entity, Code, Year + risk-factor columns)
death_causes <- read.csv("Countries and death causes.csv")
# Explore structure and summary of death causes data
str(death_causes)
## 'data.frame': 6840 obs. of 31 variables:
## $ Entity : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Year : int 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ Outdoor.air.pollution : int 3169 3222 3395 3623 3788 3869 3943 4024 4040 4042 ...
## $ High.systolic.blood.pressure : int 25633 25872 26309 26961 27658 28090 28587 29021 29349 29712 ...
## $ Diet.high.in.sodium : int 1045 1055 1075 1103 1134 1154 1178 1202 1222 1242 ...
## $ Diet.low.in.whole.grains : int 7077 7149 7297 7499 7698 7807 7943 8075 8173 8265 ...
## $ Alochol.use : int 356 364 376 389 399 406 413 420 425 426 ...
## $ Diet.low.in.fruits : int 3185 3248 3351 3480 3610 3703 3819 3938 4038 4127 ...
## $ Unsafe.water.source : int 3702 4309 5356 7152 7192 8378 8487 9348 9788 9931 ...
## $ Secondhand.smoke : int 4794 4921 5279 5734 6050 6167 6298 6425 6402 6323 ...
## $ Low.birth.weight : int 16135 17924 21200 23795 24866 25534 25997 26246 25805 25080 ...
## $ Child.wasting : int 19546 20334 22895 27002 29205 30943 31628 32736 32760 32271 ...
## $ Unsafe.sex : int 351 361 378 395 410 422 435 448 458 469 ...
## $ Diet.low.in.nuts.and.seeds : int 2319 2449 2603 2771 2932 3049 3173 3298 3401 3482 ...
## $ Household.air.pollution.from.solid.fuels: int 34372 35392 38065 41154 43153 44024 45005 46017 46055 45681 ...
## $ Diet.low.in.Vegetables : int 3679 3732 3827 3951 4075 4153 4247 4339 4409 4473 ...
## $ Low.physical.activity : int 2637 2652 2688 2744 2805 2839 2878 2914 2944 2976 ...
## $ Smoking : int 5174 5247 5363 5522 5689 5801 5934 6066 6178 6288 ...
## $ High.fasting.plasma.glucose : int 11449 11811 12265 12821 13400 13871 14413 14970 15502 16058 ...
## $ Air.pollution : int 37231 38315 41172 44488 46634 47566 48617 49703 49746 49349 ...
## $ High.body.mass.index : int 9518 9489 9528 9611 9675 9608 9503 9286 9024 8857 ...
## $ Unsafe.sanitation : int 2798 3254 4042 5392 5418 6313 6393 7038 7366 7468 ...
## $ No.access.to.handwashing.facility : int 4825 5127 5889 7007 7421 7896 8098 8507 8560 8424 ...
## $ Drug.use : int 174 188 211 232 247 260 274 287 295 302 ...
## $ Low.bone.mineral.density : int 389 389 393 411 413 417 423 425 429 428 ...
## $ Vitamin.A.deficiency : int 2016 2056 2100 2316 2665 3070 3214 3228 3413 3662 ...
## $ Child.stunting : int 7686 7886 8568 9875 11031 11973 12426 12805 13011 13052 ...
## $ Discontinued.breastfeeding : int 107 121 150 204 204 233 233 255 264 263 ...
## $ Non.exclusive.breastfeeding : int 2216 2501 3053 3726 3833 4124 4183 4393 4417 4326 ...
## $ Iron.deficiency : int 564 611 700 773 812 848 883 914 924 909 ...
summary(death_causes)
## Entity Code Year Outdoor.air.pollution
## Length:6840 Length:6840 Min. :1990 Min. : 0
## Class :character Class :character 1st Qu.:1997 1st Qu.: 434
## Mode :character Mode :character Median :2004 Median : 2101
## Mean :2004 Mean : 84582
## 3rd Qu.:2012 3rd Qu.: 11810
## Max. :2019 Max. :4506193
## High.systolic.blood.pressure Diet.high.in.sodium Diet.low.in.whole.grains
## Min. : 2 Min. : 0.0 Min. : 0.0
## 1st Qu.: 1828 1st Qu.: 137.0 1st Qu.: 273.8
## Median : 8770 Median : 969.5 Median : 1444.0
## Mean : 224225 Mean : 40497.2 Mean : 38691.3
## 3rd Qu.: 40356 3rd Qu.: 5169.8 3rd Qu.: 6773.2
## Max. :10845595 Max. :1885356.0 Max. :1844836.0
## Alochol.use Diet.low.in.fruits Unsafe.water.source Secondhand.smoke
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 1
## 1st Qu.: 263.8 1st Qu.: 144.0 1st Qu.: 7.0 1st Qu.: 209
## Median : 1780.5 Median : 834.5 Median : 182.5 Median : 994
## Mean : 54848.6 Mean : 23957.8 Mean : 44086.4 Mean : 30364
## 3rd Qu.: 8368.0 3rd Qu.: 3104.8 3rd Qu.: 5599.2 3rd Qu.: 4348
## Max. :2441973.0 Max. :1046015.0 Max. :2450944.0 Max. :1304318
## Low.birth.weight Child.wasting Unsafe.sex
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 123 1st Qu.: 26 1st Qu.: 97
## Median : 1057 Median : 504 Median : 619
## Mean : 59126 Mean : 49924 Mean : 27646
## 3rd Qu.: 10903 3rd Qu.: 9765 3rd Qu.: 4492
## Max. :3033425 Max. :3430422 Max. :1664813
## Diet.low.in.nuts.and.seeds Household.air.pollution.from.solid.fuels
## Min. : 0 Min. : 0
## 1st Qu.: 27 1st Qu.: 32
## Median : 252 Median : 821
## Mean : 12996 Mean : 83641
## 3rd Qu.: 1998 3rd Qu.: 10870
## Max. :575139 Max. :4358214
## Diet.low.in.Vegetables Low.physical.activity Smoking
## Min. : 0.0 Min. : 0.0 Min. : 1
## 1st Qu.: 109.0 1st Qu.: 92.0 1st Qu.: 894
## Median : 590.5 Median : 521.5 Median : 4987
## Mean : 11982.5 Mean : 16489.1 Mean : 181958
## 3rd Qu.: 2101.8 3rd Qu.: 2820.2 3rd Qu.: 23994
## Max. :529381.0 Max. :831502.0 Max. :7693368
## High.fasting.plasma.glucose Air.pollution High.body.mass.index
## Min. : 3 Min. : 0 Min. : 2
## 1st Qu.: 1178 1st Qu.: 816 1st Qu.: 918
## Median : 4966 Median : 5748 Median : 3917
## Mean : 117554 Mean : 164752 Mean : 89870
## 3rd Qu.: 21639 3rd Qu.: 25050 3rd Qu.: 17968
## Max. :6501398 Max. :6671740 Max. :5019360
## Unsafe.sanitation No.access.to.handwashing.facility Drug.use
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 3 1st Qu.: 19 1st Qu.: 31
## Median : 102 Median : 221 Median : 222
## Mean : 31522 Mean : 21800 Mean : 10285
## 3rd Qu.: 3854 3rd Qu.: 3954 3rd Qu.: 1224
## Max. :1842275 Max. :1200349 Max. :494492
## Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 43 1st Qu.: 0.0 1st Qu.: 1.0
## Median : 277 Median : 2.0 Median : 41.5
## Mean : 8182 Mean : 2471.6 Mean : 11164.3
## 3rd Qu.: 1232 3rd Qu.: 230.2 3rd Qu.: 1563.2
## Max. :437884 Max. :207555.0 Max. :833449.0
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 3.0 1st Qu.: 1
## Median : 4.00 Median : 60.5 Median : 12
## Mean : 431.46 Mean : 7171.9 Mean : 1421
## 3rd Qu.: 71.25 3rd Qu.: 1315.5 3rd Qu.: 238
## Max. :33106.00 Max. :505470.0 Max. :73461
head(death_causes)
## Entity Code Year Outdoor.air.pollution High.systolic.blood.pressure
## 1 Afghanistan AFG 1990 3169 25633
## 2 Afghanistan AFG 1991 3222 25872
## 3 Afghanistan AFG 1992 3395 26309
## 4 Afghanistan AFG 1993 3623 26961
## 5 Afghanistan AFG 1994 3788 27658
## 6 Afghanistan AFG 1995 3869 28090
## Diet.high.in.sodium Diet.low.in.whole.grains Alochol.use Diet.low.in.fruits
## 1 1045 7077 356 3185
## 2 1055 7149 364 3248
## 3 1075 7297 376 3351
## 4 1103 7499 389 3480
## 5 1134 7698 399 3610
## 6 1154 7807 406 3703
## Unsafe.water.source Secondhand.smoke Low.birth.weight Child.wasting
## 1 3702 4794 16135 19546
## 2 4309 4921 17924 20334
## 3 5356 5279 21200 22895
## 4 7152 5734 23795 27002
## 5 7192 6050 24866 29205
## 6 8378 6167 25534 30943
## Unsafe.sex Diet.low.in.nuts.and.seeds
## 1 351 2319
## 2 361 2449
## 3 378 2603
## 4 395 2771
## 5 410 2932
## 6 422 3049
## Household.air.pollution.from.solid.fuels Diet.low.in.Vegetables
## 1 34372 3679
## 2 35392 3732
## 3 38065 3827
## 4 41154 3951
## 5 43153 4075
## 6 44024 4153
## Low.physical.activity Smoking High.fasting.plasma.glucose Air.pollution
## 1 2637 5174 11449 37231
## 2 2652 5247 11811 38315
## 3 2688 5363 12265 41172
## 4 2744 5522 12821 44488
## 5 2805 5689 13400 46634
## 6 2839 5801 13871 47566
## High.body.mass.index Unsafe.sanitation No.access.to.handwashing.facility
## 1 9518 2798 4825
## 2 9489 3254 5127
## 3 9528 4042 5889
## 4 9611 5392 7007
## 5 9675 5418 7421
## 6 9608 6313 7896
## Drug.use Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1 174 389 2016 7686
## 2 188 389 2056 7886
## 3 211 393 2100 8568
## 4 232 411 2316 9875
## 5 247 413 2665 11031
## 6 260 417 3070 11973
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1 107 2216 564
## 2 121 2501 611
## 3 150 3053 700
## 4 204 3726 773
## 5 204 3833 812
## 6 233 4124 848
In the initial exploration of the data, I used the
str(), summary(), and head()
functions to quickly understand the structure and basic statistics of
the dataset. Based on the insights gained from these functions, I
developed two hypotheses and decided to introduce additional data frames
to support further analysis.
By examining the dataset using the str() function, I
identified several variables that represent key health risk factors,
including Outdoor air pollution, Diet high in sodium, Unsafe water
source, High systolic blood pressure, etc.
These variables reflect the health risk factors that affect different countries.
Using the summary() function to aggregate these
variables, I observed significant differences in health risks across
countries. For example, some countries have much higher levels of air
pollution compared to the global average, while others have a higher
intake of sodium in their diet. These disparities support the first
hypothesis: Countries with higher levels of air pollution, poor
diet, and low access to clean water are likely to have higher mortality
rates.
The head() function provided specific examples of how
these risk factors manifest. For instance, in 1990, Afghanistan recorded
3,169 deaths attributed to outdoor air pollution, alongside a
substantial number of deaths linked to high dietary sodium. These
observations prompted me to further investigate the correlation between
health risk factors and mortality rates.
During data analysis, I noticed clear differences in the distribution of health risks and causes of death across countries. This led me to formulate the second hypothesis: Countries can be clustered into distinct groups based on their health risk factors and causes of death, revealing common patterns in their health profiles. To validate this hypothesis, clustering analysis will group countries with similar health risk profiles, uncovering global health patterns and shared characteristics in health risks and causes of death. This approach can also pinpoint high-risk countries and inform public health policies.
To further support the analysis of these two hypotheses, I decided to introduce two new data frames:
Total Population by Country per Year: The total population data is introduced to calculate the proportion of each country’s population affected by various health risk factors. Analyzing absolute numbers could misrepresent cross-country comparisons due to differences in population size. Proportions offer a more accurate and fair reflection of the health impact within each country.
Deaths per Thousand People by Country per Year: The deaths per thousand people metric serves as a vital indicator of health outcomes across countries. Integrating this data will enable me to explore correlations between health risk factors and mortality rates, while also facilitating clustering analysis to identify patterns in causes of death and health risk profiles globally.
By incorporating these data frames, I will be able to conduct a more comprehensive analysis to test the relationship between health risk factors and mortality rates, as well as to identify global patterns in health risks.
Total population comes from WORLD BANK GROUP (https://data.worldbank.org/indicator/SP.POP.TOTL?end=2023&start=1960&view=chart)
population <- read.csv("Population data for all countries from 1960 to 2022.csv")
Mortality data comes from WORLD BANK GROUP (https://data.worldbank.org/indicator/SP.DYN.CDRT.IN?end=2023&start=2000&view=chart).
death_per_thousand <- read.csv("The number of deaths per thousand people in each country from 1960 to 2022.csv")
After conducting an initial exploration of the
population dataset, I used the str() and
summary() functions to analyze the structure and content of
the data. I made the decision to clean the dataset by removing
unnecessary columns and irrelevant year data.
str(population)
## 'data.frame': 263 obs. of 68 variables:
## $ Country : chr "Aruba" "Africa Eastern and Southern" "Afghanistan" "Africa Western and Central" ...
## $ CountryCode : chr "ABW" "AFE" "AFG" "AFW" ...
## $ IndicatorName: chr "Population, total" "Population, total" "Population, total" "Population, total" ...
## $ IndicatorCode: chr "SP.POP.TOTL" "SP.POP.TOTL" "SP.POP.TOTL" "SP.POP.TOTL" ...
## $ X1960 : num 5.46e+04 1.31e+08 8.62e+06 9.73e+07 5.36e+06 ...
## $ X1961 : num 5.58e+04 1.34e+08 8.79e+06 9.93e+07 5.44e+06 ...
## $ X1962 : num 5.67e+04 1.38e+08 8.97e+06 1.01e+08 5.52e+06 ...
## $ X1963 : num 5.75e+04 1.42e+08 9.16e+06 1.04e+08 5.60e+06 ...
## $ X1964 : num 5.82e+04 1.46e+08 9.36e+06 1.06e+08 5.67e+06 ...
## $ X1965 : num 5.88e+04 1.50e+08 9.57e+06 1.08e+08 5.74e+06 ...
## $ X1966 : num 5.93e+04 1.54e+08 9.78e+06 1.11e+08 5.79e+06 ...
## $ X1967 : num 5.95e+04 1.58e+08 1.00e+07 1.13e+08 5.83e+06 ...
## $ X1968 : num 5.95e+04 1.63e+08 1.02e+07 1.16e+08 5.87e+06 ...
## $ X1969 : num 5.93e+04 1.68e+08 1.05e+07 1.19e+08 5.93e+06 ...
## $ X1970 : num 5.91e+04 1.72e+08 1.08e+07 1.21e+08 6.03e+06 ...
## $ X1971 : num 5.88e+04 1.78e+08 1.10e+07 1.24e+08 6.18e+06 ...
## $ X1972 : num 5.89e+04 1.83e+08 1.13e+07 1.27e+08 6.36e+06 ...
## $ X1973 : num 5.94e+04 1.88e+08 1.16e+07 1.31e+08 6.58e+06 ...
## $ X1974 : num 6.00e+04 1.94e+08 1.19e+07 1.34e+08 6.80e+06 ...
## $ X1975 : num 6.07e+04 1.99e+08 1.22e+07 1.38e+08 7.03e+06 ...
## $ X1976 : num 6.12e+04 2.05e+08 1.24e+07 1.41e+08 7.27e+06 ...
## $ X1977 : num 6.15e+04 2.11e+08 1.27e+07 1.45e+08 7.51e+06 ...
## $ X1978 : num 6.17e+04 2.17e+08 1.29e+07 1.49e+08 7.77e+06 ...
## $ X1979 : num 6.20e+04 2.24e+08 1.30e+07 1.53e+08 8.04e+06 ...
## $ X1980 : num 6.23e+04 2.31e+08 1.25e+07 1.58e+08 8.33e+06 ...
## $ X1981 : num 6.26e+04 2.38e+08 1.12e+07 1.62e+08 8.63e+06 ...
## $ X1982 : num 6.31e+04 2.45e+08 1.01e+07 1.67e+08 8.95e+06 ...
## $ X1983 : num 6.37e+04 2.53e+08 9.95e+06 1.72e+08 9.28e+06 ...
## $ X1984 : num 6.42e+04 2.60e+08 1.02e+07 1.76e+08 9.62e+06 ...
## $ X1985 : num 6.45e+04 2.68e+08 1.05e+07 1.81e+08 9.97e+06 ...
## $ X1986 : num 6.46e+04 2.76e+08 1.04e+07 1.86e+08 1.03e+07 ...
## $ X1987 : num 6.44e+04 2.84e+08 1.03e+07 1.91e+08 1.07e+07 ...
## $ X1988 : num 6.43e+04 2.93e+08 1.04e+07 1.96e+08 1.11e+07 ...
## $ X1989 : num 6.46e+04 3.01e+08 1.07e+07 2.01e+08 1.14e+07 ...
## $ X1990 : num 6.57e+04 3.10e+08 1.07e+07 2.07e+08 1.18e+07 ...
## $ X1991 : num 6.79e+04 3.19e+08 1.07e+07 2.12e+08 1.22e+07 ...
## $ X1992 : num 7.02e+04 3.27e+08 1.21e+07 2.18e+08 1.26e+07 ...
## $ X1993 : num 7.24e+04 3.36e+08 1.40e+07 2.24e+08 1.30e+07 ...
## $ X1994 : num 7.47e+04 3.44e+08 1.55e+07 2.30e+08 1.35e+07 ...
## $ X1995 : num 7.70e+04 3.53e+08 1.64e+07 2.36e+08 1.39e+07 ...
## $ X1996 : num 7.94e+04 3.63e+08 1.71e+07 2.42e+08 1.44e+07 ...
## $ X1997 : num 8.19e+04 3.72e+08 1.78e+07 2.49e+08 1.49e+07 ...
## $ X1998 : num 8.44e+04 3.82e+08 1.85e+07 2.55e+08 1.54e+07 ...
## $ X1999 : num 8.69e+04 3.91e+08 1.93e+07 2.62e+08 1.59e+07 ...
## $ X2000 : num 8.91e+04 4.02e+08 1.95e+07 2.70e+08 1.64e+07 ...
## $ X2001 : num 9.07e+04 4.12e+08 1.97e+07 2.77e+08 1.69e+07 ...
## $ X2002 : num 9.18e+04 4.23e+08 2.10e+07 2.85e+08 1.75e+07 ...
## $ X2003 : num 9.27e+04 4.34e+08 2.26e+07 2.93e+08 1.81e+07 ...
## $ X2004 : num 9.35e+04 4.45e+08 2.36e+07 3.01e+08 1.88e+07 ...
## $ X2005 : num 9.45e+04 4.57e+08 2.44e+07 3.10e+08 1.95e+07 ...
## $ X2006 : num 9.56e+04 4.70e+08 2.54e+07 3.19e+08 2.02e+07 ...
## $ X2007 : num 9.68e+04 4.82e+08 2.59e+07 3.28e+08 2.09e+07 ...
## $ X2008 : num 9.80e+04 4.96e+08 2.64e+07 3.37e+08 2.17e+07 ...
## $ X2009 : num 9.92e+04 5.09e+08 2.74e+07 3.46e+08 2.25e+07 ...
## $ X2010 : num 1.00e+05 5.23e+08 2.82e+07 3.56e+08 2.34e+07 ...
## $ X2011 : num 1.01e+05 5.38e+08 2.92e+07 3.66e+08 2.43e+07 ...
## $ X2012 : num 1.02e+05 5.53e+08 3.05e+07 3.77e+08 2.52e+07 ...
## $ X2013 : num 1.03e+05 5.68e+08 3.15e+07 3.87e+08 2.61e+07 ...
## $ X2014 : num 1.04e+05 5.84e+08 3.27e+07 3.98e+08 2.71e+07 ...
## $ X2015 : num 1.04e+05 6.00e+08 3.38e+07 4.09e+08 2.81e+07 ...
## $ X2016 : num 1.05e+05 6.16e+08 3.46e+07 4.20e+08 2.92e+07 ...
## $ X2017 : num 1.05e+05 6.33e+08 3.56e+07 4.31e+08 3.02e+07 ...
## $ X2018 : num 1.06e+05 6.50e+08 3.67e+07 4.43e+08 3.13e+07 ...
## $ X2019 : num 1.06e+05 6.67e+08 3.78e+07 4.54e+08 3.24e+07 ...
## $ X2020 : num 1.07e+05 6.85e+08 3.90e+07 4.66e+08 3.34e+07 ...
## $ X2021 : num 1.07e+05 7.03e+08 4.01e+07 4.78e+08 3.45e+07 ...
## $ X2022 : num 1.06e+05 7.21e+08 4.11e+07 4.90e+08 3.56e+07 ...
## $ X2023 : num 1.06e+05 7.39e+08 4.22e+07 5.03e+08 3.67e+07 ...
summary(population)
## Country CountryCode IndicatorName IndicatorCode
## Length:263 Length:263 Length:263 Length:263
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## X1960 X1961 X1962
## Min. :2.646e+03 Min. :2.888e+03 Min. :3.171e+03
## 1st Qu.:5.131e+05 1st Qu.:5.219e+05 1st Qu.:5.311e+05
## Median :3.708e+06 Median :3.785e+06 Median :3.864e+06
## Mean :1.047e+08 Mean :1.062e+08 Mean :1.081e+08
## 3rd Qu.:2.580e+07 3rd Qu.:2.658e+07 3rd Qu.:2.738e+07
## Max. :2.298e+09 Max. :2.330e+09 Max. :2.375e+09
## X1963 X1964 X1965
## Min. :3.481e+03 Min. :3.811e+03 Min. :4.161e+03
## 1st Qu.:5.408e+05 1st Qu.:5.511e+05 1st Qu.:5.593e+05
## Median :3.946e+06 Median :4.029e+06 Median :4.116e+06
## Mean :1.105e+08 Mean :1.130e+08 Mean :1.154e+08
## 3rd Qu.:2.819e+07 3rd Qu.:2.900e+07 3rd Qu.:2.976e+07
## Max. :2.432e+09 Max. :2.489e+09 Max. :2.547e+09
## X1966 X1967 X1968
## Min. :4.531e+03 Min. :4.930e+03 Min. :5.354e+03
## 1st Qu.:5.679e+05 1st Qu.:5.761e+05 1st Qu.:5.831e+05
## Median :4.204e+06 Median :4.296e+06 Median :4.403e+06
## Mean :1.180e+08 Mean :1.206e+08 Mean :1.233e+08
## 3rd Qu.:3.052e+07 3rd Qu.:3.106e+07 3rd Qu.:3.157e+07
## Max. :2.609e+09 Max. :2.670e+09 Max. :2.734e+09
## X1969 X1970 X1971
## Min. :5.646e+03 Min. :5.665e+03 Min. :5.742e+03
## 1st Qu.:5.873e+05 1st Qu.:5.947e+05 1st Qu.:6.069e+05
## Median :4.513e+06 Median :4.586e+06 Median :4.612e+06
## Mean :1.261e+08 Mean :1.289e+08 Mean :1.317e+08
## 3rd Qu.:3.205e+07 3rd Qu.:3.245e+07 3rd Qu.:3.283e+07
## Max. :2.800e+09 Max. :2.867e+09 Max. :2.934e+09
## X1972 X1973 X1974
## Min. :5.891e+03 Min. :5.934e+03 Min. :6.100e+03
## 1st Qu.:6.244e+05 1st Qu.:6.430e+05 1st Qu.:6.488e+05
## Median :4.725e+06 Median :4.851e+06 Median :4.979e+06
## Mean :1.345e+08 Mean :1.374e+08 Mean :1.402e+08
## 3rd Qu.:3.328e+07 3rd Qu.:3.373e+07 3rd Qu.:3.419e+07
## Max. :3.001e+09 Max. :3.069e+09 Max. :3.136e+09
## X1975 X1976 X1977
## Min. :6.381e+03 Min. :6.668e+03 Min. :6.885e+03
## 1st Qu.:6.615e+05 1st Qu.:6.898e+05 1st Qu.:7.267e+05
## Median :5.060e+06 Median :5.181e+06 Median :5.308e+06
## Mean :1.430e+08 Mean :1.457e+08 Mean :1.485e+08
## 3rd Qu.:3.465e+07 3rd Qu.:3.510e+07 3rd Qu.:3.564e+07
## Max. :3.203e+09 Max. :3.268e+09 Max. :3.334e+09
## X1978 X1979 X1980
## Min. :7.110e+03 Min. :7.332e+03 Min. :7.598e+03
## 1st Qu.:7.815e+05 1st Qu.:7.980e+05 1st Qu.:8.048e+05
## Median :5.429e+06 Median :5.553e+06 Median :5.720e+06
## Mean :1.513e+08 Mean :1.542e+08 Mean :1.571e+08
## 3rd Qu.:3.643e+07 3rd Qu.:3.720e+07 3rd Qu.:3.781e+07
## Max. :3.401e+09 Max. :3.469e+09 Max. :3.538e+09
## X1981 X1982 X1983
## Min. :7.691e+03 Min. :7.672e+03 Min. :7.832e+03
## 1st Qu.:8.123e+05 1st Qu.:8.241e+05 1st Qu.:8.417e+05
## Median :5.863e+06 Median :5.991e+06 Median :6.143e+06
## Mean :1.600e+08 Mean :1.631e+08 Mean :1.663e+08
## 3rd Qu.:3.824e+07 3rd Qu.:3.866e+07 3rd Qu.:3.907e+07
## Max. :3.609e+09 Max. :3.684e+09 Max. :3.760e+09
## X1984 X1985 X1986
## Min. :8.125e+03 Min. :8.313e+03 Min. :8.496e+03
## 1st Qu.:8.598e+05 1st Qu.:8.783e+05 1st Qu.:9.016e+05
## Median :6.282e+06 Median :6.418e+06 Median :6.522e+06
## Mean :1.694e+08 Mean :1.726e+08 Mean :1.759e+08
## 3rd Qu.:3.980e+07 3rd Qu.:4.055e+07 3rd Qu.:4.133e+07
## Max. :3.836e+09 Max. :3.913e+09 Max. :3.993e+09
## X1987 X1988 X1989
## Min. :8.665e+03 Min. :8.844e+03 Min. :9.017e+03
## 1st Qu.:9.262e+05 1st Qu.:9.520e+05 1st Qu.:9.790e+05
## Median :6.693e+06 Median :6.834e+06 Median :6.980e+06
## Mean :1.793e+08 Mean :1.827e+08 Mean :1.861e+08
## 3rd Qu.:4.224e+07 3rd Qu.:4.327e+07 3rd Qu.:4.432e+07
## Max. :4.074e+09 Max. :4.157e+09 Max. :4.238e+09
## X1990 X1991 X1992
## Min. :9.182e+03 Min. :9.354e+03 Min. :9.466e+03
## 1st Qu.:1.012e+06 1st Qu.:1.040e+06 1st Qu.:1.061e+06
## Median :7.096e+06 Median :7.245e+06 Median :7.382e+06
## Mean :1.895e+08 Mean :1.929e+08 Mean :1.963e+08
## 3rd Qu.:4.537e+07 3rd Qu.:4.662e+07 3rd Qu.:4.788e+07
## Max. :4.320e+09 Max. :4.400e+09 Max. :4.479e+09
## X1993 X1994 X1995
## Min. :9.517e+03 Min. :9.559e+03 Min. :9.585e+03
## 1st Qu.:1.081e+06 1st Qu.:1.103e+06 1st Qu.:1.122e+06
## Median :7.495e+06 Median :7.486e+06 Median :7.625e+06
## Mean :1.996e+08 Mean :2.029e+08 Mean :2.062e+08
## 3rd Qu.:4.819e+07 3rd Qu.:4.828e+07 3rd Qu.:4.830e+07
## Max. :4.557e+09 Max. :4.635e+09 Max. :4.713e+09
## X1996 X1997 X1998
## Min. :9.611e+03 Min. :9.630e+03 Min. :9.634e+03
## 1st Qu.:1.146e+06 1st Qu.:1.171e+06 1st Qu.:1.197e+06
## Median :7.683e+06 Median :7.838e+06 Median :7.977e+06
## Mean :2.095e+08 Mean :2.128e+08 Mean :2.160e+08
## 3rd Qu.:4.829e+07 3rd Qu.:4.827e+07 3rd Qu.:4.822e+07
## Max. :4.790e+09 Max. :4.867e+09 Max. :4.944e+09
## X1999 X2000 X2001
## Min. :9.640e+03 Min. :9.638e+03 Min. :9.621e+03
## 1st Qu.:1.224e+06 1st Qu.:1.252e+06 1st Qu.:1.282e+06
## Median :8.010e+06 Median :8.170e+06 Median :8.224e+06
## Mean :2.192e+08 Mean :2.224e+08 Mean :2.257e+08
## 3rd Qu.:4.845e+07 3rd Qu.:4.890e+07 3rd Qu.:4.938e+07
## Max. :5.019e+09 Max. :5.094e+09 Max. :5.169e+09
## X2002 X2003 X2004
## Min. :9.609e+03 Min. :9.668e+03 Min. :9.791e+03
## 1st Qu.:1.314e+06 1st Qu.:1.335e+06 1st Qu.:1.354e+06
## Median :8.372e+06 Median :8.568e+06 Median :8.792e+06
## Mean :2.289e+08 Mean :2.321e+08 Mean :2.353e+08
## 3rd Qu.:4.993e+07 3rd Qu.:5.065e+07 3rd Qu.:5.169e+07
## Max. :5.244e+09 Max. :5.318e+09 Max. :5.392e+09
## X2005 X2006 X2007
## Min. :9.912e+03 Min. :1.003e+04 Min. :1.015e+04
## 1st Qu.:1.362e+06 1st Qu.:1.362e+06 1st Qu.:1.363e+06
## Median :9.026e+06 Median :9.081e+06 Median :9.148e+06
## Mean :2.386e+08 Mean :2.418e+08 Mean :2.451e+08
## 3rd Qu.:5.278e+07 3rd Qu.:5.382e+07 3rd Qu.:5.422e+07
## Max. :5.466e+09 Max. :5.540e+09 Max. :5.613e+09
## X2008 X2009 X2010
## Min. :1.024e+04 Min. :1.023e+04 Min. :1.024e+04
## 1st Qu.:1.419e+06 1st Qu.:1.464e+06 1st Qu.:1.489e+06
## Median :9.220e+06 Median :9.299e+06 Median :9.484e+06
## Mean :2.484e+08 Mean :2.517e+08 Mean :2.551e+08
## 3rd Qu.:5.470e+07 3rd Qu.:5.513e+07 3rd Qu.:5.553e+07
## Max. :5.687e+09 Max. :5.762e+09 Max. :5.839e+09
## X2011 X2012 X2013
## Min. :1.028e+04 Min. :1.044e+04 Min. :1.069e+04
## 1st Qu.:1.515e+06 1st Qu.:1.542e+06 1st Qu.:1.569e+06
## Median :9.726e+06 Median :9.920e+06 Median :1.015e+07
## Mean :2.585e+08 Mean :2.621e+08 Mean :2.656e+08
## 3rd Qu.:5.591e+07 3rd Qu.:5.634e+07 3rd Qu.:5.705e+07
## Max. :5.918e+09 Max. :5.999e+09 Max. :6.080e+09
## X2014 X2015 X2016
## Min. :1.090e+04 Min. :1.088e+04 Min. :1.085e+04
## 1st Qu.:1.597e+06 1st Qu.:1.624e+06 1st Qu.:1.623e+06
## Median :1.028e+07 Median :1.036e+07 Median :1.033e+07
## Mean :2.692e+08 Mean :2.727e+08 Mean :2.763e+08
## 3rd Qu.:5.776e+07 3rd Qu.:5.830e+07 3rd Qu.:5.852e+07
## Max. :6.161e+09 Max. :6.241e+09 Max. :6.321e+09
## X2017 X2018 X2019
## Min. :1.083e+04 Min. :1.086e+04 Min. :1.096e+04
## 1st Qu.:1.635e+06 1st Qu.:1.651e+06 1st Qu.:1.671e+06
## Median :1.030e+07 Median :1.040e+07 Median :1.045e+07
## Mean :2.798e+08 Mean :2.833e+08 Mean :2.868e+08
## 3rd Qu.:5.859e+07 3rd Qu.:5.926e+07 3rd Qu.:5.980e+07
## Max. :6.401e+09 Max. :6.479e+09 Max. :6.555e+09
## X2020 X2021 X2022
## Min. :1.107e+04 Min. :1.120e+04 Min. :1.131e+04
## 1st Qu.:1.693e+06 1st Qu.:1.710e+06 1st Qu.:1.721e+06
## Median :1.061e+07 Median :1.051e+07 Median :1.049e+07
## Mean :2.901e+08 Mean :2.931e+08 Mean :2.958e+08
## 3rd Qu.:6.057e+07 3rd Qu.:6.149e+07 3rd Qu.:6.270e+07
## Max. :6.629e+09 Max. :6.696e+09 Max. :6.754e+09
## X2023
## Min. :1.140e+04
## 1st Qu.:1.736e+06
## Median :1.059e+07
## Mean :2.989e+08
## 3rd Qu.:6.393e+07
## Max. :6.820e+09
By using the str() function, I observed that the dataset
contains a Country identifier, three metadata columns
(CountryCode, IndicatorName, and
IndicatorCode), and one population column per year
(X1960 through X2023). While the
Country column is essential for identifying each country,
the three metadata columns do not contribute directly to the analysis.
Using the summary() function, I observed that:
The IndicatorName and IndicatorCode fields show the same value for every entry, so they carry no distinguishing information, and CountryCode merely duplicates the identification already provided by the Country column. These three columns are therefore redundant, and I decided to remove them to simplify the data structure.
The summary of the year columns (from 1960 to 2023) provided
insights into population values across different countries. However,
since my death_causes dataset only includes data from 1990
to 2019, it is crucial to align the time range between the two datasets.
By removing data from the years 1960-1989 and 2020-2023, I can avoid any
inconsistencies in the analysis due to mismatched time periods.
Based on the analysis, I made the following decisions for cleaning the dataset:
Remove the Country Code, Indicator Name, and Indicator Code columns: These columns do not provide meaningful information for the analysis and are redundant.
Remove the data for the years 1960-1989 and
2020-2023: To ensure consistency between the
population and death_causes datasets, I
retained data from 1990 to 2019, as these years match the time period in
the death_causes dataset.
# Keep only the country name and the 1990-2019 population columns.
# Selecting the wanted columns directly also drops CountryCode,
# IndicatorName and IndicatorCode, so a separate negative
# select(-c(...)) step would be redundant.
population_clean <- population %>%
  select(Country, X1990:X2019)
In addition, after using the str() and
summary() functions to analyze the
population data, I observed that the dataset is in a
wide format, with each year represented as a separate
column (e.g., X1960, X1990). While this format
is useful for quickly viewing the data, it is not ideal when merging it
with the death_causes dataset.
To facilitate further analysis, I need to transform the wide
format into a long format, where each row
represents the population data for a country in a specific year, rather
than having each year occupy a separate column. This format will make it
easier to merge with other time-series data based on country and year.
Additionally, the year columns that start with the “X” prefix (e.g.,
X1990) need to have the “X” removed for cleaner and clearer
data representation.
I used the pivot_longer() function to transform the year
columns into two columns, “Year” and “Population,” while removing the
“X” prefix from the year values:
# Reshape wide -> long: one row per country-year.
# names_prefix strips the leading "X" from the year columns and
# names_transform converts the remaining digits to numeric, replacing
# the separate sub()/as.numeric() post-processing step.
population_long <- population_clean %>%
  pivot_longer(
    cols = starts_with("X"),
    names_to = "Year",
    names_prefix = "X",
    names_transform = list(Year = as.numeric),
    values_to = "Population"
  )
# Confirm the reshaped data contains no missing values
sum(is.na(population_long))
## [1] 0
The transformed long-format data presents each country and year as a
separate row, facilitating easier merging with other datasets by year.
This approach ensures data consistency and provides a solid foundation
for further analysis. After checking for missing values, the results
confirmed that the population_long data frame contains no
NA or missing entries, indicating the dataset is complete for all
countries and years.
When processing the death_per_thousand dataset, I
followed the same steps as I did for the population
dataset. To avoid redundancy, I will not describe this process in detail
again. In short, for the death_per_thousand dataset, I
removed unnecessary columns (such as CountryCode,
IndicatorName, and IndicatorCode), and
transformed the wide-format data into a long-format, where each row
represents the number of deaths per thousand people in a specific
country for a given year. This ensures that the structure of the
death_per_thousand dataset is consistent with that of
the population dataset, facilitating the subsequent
merging and analysis.
# Same cleaning as for population: keep the country name plus the
# 1990-2019 columns (this single positive select also drops
# CountryCode / IndicatorName / IndicatorCode, so no negative
# select is needed).
death_clean <- death_per_thousand %>%
  select(Country, X1990:X2019)
# NOTE(review): the value column here holds the death rate, not a
# population, but it is kept as "Population" because the later merge
# step relies on the Population.x / Population.y suffixes before
# renaming; switching to a clearer name such as "DeathRate" would
# require updating that downstream code as well.
death_long <- death_clean %>%
  pivot_longer(
    cols = starts_with("X"),
    names_to = "Year",
    names_prefix = "X",
    names_transform = list(Year = as.numeric),
    values_to = "Population"
  )
# Confirm no missing values after reshaping
sum(is.na(death_long))
ChatGPT provided the initial code for data cleaning using
pivot_longer() and mutate() functions, which
helped me convert the data from wide table format to long table
format.
I removed redundant columns such as country code and indicator code, and adjusted the time range to keep the data from 1990-2019 to ensure consistency between different data sets. These modifications ensured that the data cleaning process met the analysis requirements.
In my initial analysis of the death_causes dataset, I found that it contains both individual country data and aggregated data for regions or income groups, such as “Western Pacific Region (WHO)” and “World Bank Low Income.” Since my research focuses on the relationship between country-specific health risk factors and mortality rates, these aggregated entries are irrelevant. To maintain accuracy and ensure the analysis aligns with my objectives, I will remove these rows.
# Aggregated rows (world totals, WHO regions, World Bank income and
# regional groups) are not individual countries and would distort a
# country-level analysis, so they are filtered out of death_causes.
regions_to_exclude <- c(
  "World",
  "Western Pacific Region (WHO)",
  "Eastern Mediterranean Region (WHO)",
  "European Region (WHO)",
  "Region of the Americas (WHO)",
  "African Region (WHO)",
  "World Bank High Income",
  "World Bank Low Income",
  "World Bank Upper Middle Income",
  "World Bank Lower Middle Income",
  "Sub-Saharan Africa (WB)",
  "South Asia (WB)",
  "North America (WB)",
  "Latin America & Caribbean (WB)",
  "Middle East & North Africa (WB)",
  "East Asia & Pacific (WB)",
  "Europe & Central Asia (WB)"
)

# Keep only rows whose Entity is an actual country
death_causes_filtered <- death_causes %>%
  filter(!(Entity %in% regions_to_exclude))
Upon further exploration of the data, I identified several columns with a high proportion of 0 values, particularly in Vitamin.A.deficiency, Child.stunting, Discontinued.breastfeeding, and Iron.deficiency. In many countries, these health risk factors appear as 0, which is likely an indication of missing data rather than actual 0 values. It is highly unlikely that these health risk factors are truly zero, especially in countries with larger populations.
# Share of zero entries per column, computed on the country-level data
# (death_causes_filtered) so that the aggregate-region rows removed
# above do not skew the ratios; the subsequent imputation operates on
# the same filtered frame.
zero_ratio <- colSums(death_causes_filtered == 0) / nrow(death_causes_filtered)
# Columns where more than 20% of values are zero (likely missing data)
zero_ratio[zero_ratio > 0.2]
## Vitamin.A.deficiency Child.stunting
## 0.4356725 0.2116959
## Discontinued.breastfeeding Iron.deficiency
## 0.3533626 0.2138889
To preserve valuable data, I filled the 0 values in these columns with the column’s median. Using the median, a robust statistic, helps prevent distortion from extreme values and ensures missing data is imputed without introducing significant bias. This approach maintains the integrity of country-specific data while enhancing the reliability of the analysis. However, I decided to fill the 0 values in these four columns with the median, without addressing small values. Handling small values is not suitable for this dataset due to the differences in population size between countries. For instance, in smaller countries, the “Vitamin.A.deficiency” figure might be legitimately small, and thus should not be treated as missing. Therefore, I only treat 0 values as missing data and fill them with the median.
# Fill systemic missing values in specific columns
# Columns where a 0 is treated as systematically missing data
systemic_missing_cols <- c("Vitamin.A.deficiency", "Child.stunting", "Discontinued.breastfeeding", "Iron.deficiency")
# Replace each 0 with the median of the *non-zero* observations:
# since the zeros are being treated as missing, including them in the
# median would bias the imputed value downward. mutate_at() is
# superseded in dplyr; across() is the current idiom.
death_causes_clean <- death_causes_filtered %>%
  mutate(across(
    all_of(systemic_missing_cols),
    ~ ifelse(.x == 0, median(.x[.x != 0], na.rm = TRUE), .x)
  ))
# View summary of the imputed columns
summary(death_causes_clean[systemic_missing_cols])
## Vitamin.A.deficiency Child.stunting Discontinued.breastfeeding
## Min. : 1.0 Min. : 1 Min. : 1.0
## 1st Qu.: 1.0 1st Qu.: 17 1st Qu.: 2.0
## Median : 1.0 Median : 24 Median : 2.0
## Mean : 772.2 Mean : 3719 Mean : 132.5
## 3rd Qu.: 109.0 3rd Qu.: 874 3rd Qu.: 44.0
## Max. :65879.0 Max. :331514 Max. :14476.0
## Iron.deficiency
## Min. : 1.0
## 1st Qu.: 6.0
## Median : 8.0
## Mean : 529.1
## 3rd Qu.: 150.0
## Max. :39958.0
Initially, I considered detecting and removing outliers to maintain data consistency. However, upon further analysis, I discovered that a large portion of the data fell under the outlier category. Eliminating these points would lead to significant data loss, undermining the reliability of the analysis.
The identified outliers may represent genuine differences between countries rather than statistical anomalies. For instance, countries with severe air pollution or unsafe water access are not merely outliers but critical cases that warrant inclusion.
By retaining all data points, I ensured the analysis accurately captures the diversity of health risks across countries, supporting my hypothesis of significant disparities in global health risks and mortality rates.
Merging Data: I merged the population_long (total population), death_long (death rate per thousand), and death_causes (health risk factors) datasets. This ensures that each country, for every year, has complete data on population, mortality rate, and relevant health risk factors for further analysis.
Renaming Columns: After merging, R automatically labeled the
population columns as Population.x and
Population.y. I renamed them to population and death rate
respectively for clarity and readability.
Standardizing Health Risk Factors: To ensure a fair comparison of health risk factors across countries with varying population sizes, I converted these factors into rates per thousand people. Using absolute numbers could introduce bias, whereas standardizing them allows for a more accurate assessment of the impact on different populations.
library(dplyr)
# Health-risk-factor columns to standardize from absolute counts to per-thousand rates
health_factors <- c("Outdoor.air.pollution", "High.systolic.blood.pressure",
                    "Diet.high.in.sodium", "Diet.low.in.whole.grains",
                    "Diet.low.in.fruits", "Unsafe.water.source", "Secondhand.smoke", "Alochol.use",
                    "High.body.mass.index",
                    "Low.birth.weight", "Child.wasting", "Unsafe.sex",
                    "Diet.low.in.nuts.and.seeds", "Household.air.pollution.from.solid.fuels",
                    "Diet.low.in.Vegetables", "Low.physical.activity", "Smoking",
                    "High.fasting.plasma.glucose", "Air.pollution", "Unsafe.sanitation",
                    "No.access.to.handwashing.facility", "Drug.use", "Low.bone.mineral.density",
                    "Vitamin.A.deficiency", "Child.stunting", "Discontinued.breastfeeding",
                    "Non.exclusive.breastfeeding", "Iron.deficiency")
# Join population and death-rate tables by country-year, then attach risk factors
merged_data <- merge(population_long, death_long, by = c("Country", "Year"))
death_causes_clean <- rename(death_causes_clean, Country = Entity)
new_merged_data_clean <- merge(merged_data, death_causes_clean, by = c("Country", "Year"))
# merge() suffixed the two like-named columns as .x/.y; restore meaningful names
# (Population.y carries the death rate per thousand that came from death_long)
new_merged_data_clean <- new_merged_data_clean %>%
  rename(population = Population.x,
         `death rate` = Population.y)
# Convert absolute counts into rates per thousand people.
# mutate(across(any_of(...))) supersedes the deprecated
# mutate_at(vars(one_of(...))) combination and behaves identically here.
new_merged_data_clean <- new_merged_data_clean %>%
  mutate(across(any_of(health_factors), ~ (.x / population) * 1000))
The following sections analyze each chart and highlight the key insights derived from the data.
# Histogram of the per-thousand high-blood-pressure rate across all country-years
ggplot(new_merged_data_clean, aes(x = High.systolic.blood.pressure)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "blue") +
  ggtitle("Distribution of High.systolic.blood.pressure") +
  xlab("High.systolic.blood.pressure (per thousand)") +
  ylab("Frequency")
The majority of countries have a low prevalence of high systolic blood pressure, primarily within the range of 1 to 2 people per thousand, with the affected number sharply decreasing as the ratio increases. This indicates that high blood pressure is a relatively mild problem in most countries.
The data shows a right-skewed distribution, meaning that a small number of countries face a more severe high blood pressure issue, where the ratio reaches 5 to 6 people per thousand. These countries may require greater attention in terms of public health management.
This chart highlights countries with a higher prevalence of high blood pressure, supporting the hypothesis that health risk factors vary significantly across regions. While most countries experience mild cases, some face more severe risks.
# Scatter of blood-pressure rate against death rate, with a linear trend line
ggplot(new_merged_data_clean, aes(x = High.systolic.blood.pressure, y = `death rate`)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  ggtitle("High.systolic.blood.pressure vs Death Rate") +
  xlab("High.systolic.blood.pressure Rate (per thousand)") +
  ylab("Death Rate (per thousand)") +
  ylim(0, 25)
This chart supports the hypothesis that health risk factors, such as high blood pressure, contribute to a country’s mortality rate. Although it is not the sole determinant, high blood pressure plays a key role.
# Per-country z-scores so pollution and death rate share one axis in the plots.
# Fixes two issues in the previous version: scale() returns a 1-column matrix
# (which gets stored as a matrix-column), so wrap it in as.numeric(); and the
# grouping was never removed, which would silently affect every later
# operation on this data frame -- hence the explicit ungroup().
new_merged_data_clean <- new_merged_data_clean %>%
  group_by(Country) %>%
  mutate(air_pollution_normalized = as.numeric(scale(Outdoor.air.pollution)),
         death_rate_normalized = as.numeric(scale(`death rate`))) %>%
  ungroup()
# China only: compare the scaled air-pollution and death-rate trajectories over time
china_data <- filter(new_merged_data_clean, Country == "China")
ggplot(china_data, aes(x = Year)) +
  geom_line(aes(y = air_pollution_normalized, colour = "Air Pollution")) +
  geom_line(aes(y = death_rate_normalized, colour = "Death Rate")) +
  ggtitle("Outdoor Air Pollution vs Death Rate in China (Scaled)") +
  xlab("Year") +
  ylab("Scaled Rate") +
  scale_color_manual(values = c("Air Pollution" = "red", "Death Rate" = "blue"))
This time series supports the hypothesis that air pollution impacts mortality rates, especially during periods of severe pollution.
library(reshape2)
# Long format: one row per (country, year, risk factor) for the heatmap
health_factors_long <- melt(new_merged_data_clean, id.vars = c("Country", "Year"), measure.vars = health_factors)
# Restrict the heatmap to a representative set of countries
selected_countries <- c("United States", "China", "India", "Japan", "Brazil", "Philippines", "Australia", "France")
health_factors_long_filtered <- subset(health_factors_long, Country %in% selected_countries)
# Heatmap: risk-factor level per country (darker red = higher per-thousand rate)
ggplot(health_factors_long_filtered, aes(x = variable, y = Country, fill = value)) +
  geom_tile() +
  ggtitle("Health Risk Factors Across Selected Countries") +
  xlab("Health Risk Factors") +
  ylab("Countries") +
  scale_fill_gradient(low = "white", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text.y = element_text(size = 8),
        plot.title = element_text(hjust = 0.5))
China, Japan, and the United States show high values for several health risk factors, such as high systolic blood pressure and smoking, indicating significant public health challenges in these areas.
Australia, France, and Brazil display relatively low levels of health risks, suggesting that these countries have more effective health management and infrastructure in place.
The noticeable differences across countries for the same health risk factors further demonstrate the global disparity in health risks.
The primary goal of this project is to classify countries by mortality risk using a binary classification approach. I selected “death rate per thousand people” as the target variable.
To transform this target variable into a binary classification problem, I decided to use the median of the death rate as the dividing threshold: country-years with a death rate above the median are labeled 1 (high mortality risk), and all others are labeled 0 (low mortality risk).
The median was chosen as the threshold to ensure a balanced split between high and low mortality risks, preventing data imbalance. This approach addresses the challenge of skewed data, where most countries have relatively low death rates, which could bias the model towards predicting low risk. By using the median, I aim to enhance the model’s accuracy and effectiveness, avoiding oversimplified predictions, such as consistently classifying cases as low risk.
Removal of character columns: Country and Code are identifiers rather than predictive features, so they are dropped.
Removal of the Year column: the classification should depend on health risk factors, not on the year itself.
Removal of the population column: population has already been used to standardize the health factors into per-thousand rates, so keeping it would be redundant.
The following R code demonstrates how the data cleaning and target variable generation were implemented:
# The median death rate defines the high/low mortality-risk threshold
median_death_rate <- median(new_merged_data_clean$`death rate`, na.rm = TRUE)
# Label 1 = above the median (high risk), 0 = at or below it (low risk)
new_merged_data_clean$death_risk_category <- as.numeric(new_merged_data_clean$`death rate` > median_death_rate)
# Drop identifiers and helper columns that must not leak into the predictors
df_cleaned <- subset(new_merged_data_clean, select = -c(Country, Year, population, Code, air_pollution_normalized, death_rate_normalized, `death rate`))
By using the death rate per thousand people as the target variable, I redefined the problem as a binary classification task, categorizing countries into high and low mortality risk groups. The data cleaning process eliminated irrelevant and redundant columns, resulting in a refined dataset ready for feature selection and model development.
In the feature selection process, I performed two rounds to identify optimal feature subsets for enhancing the predictive model’s performance. This involved evaluating features based on their AUC (Area Under the Curve) scores, calculating the mean AUC through cross-validation, and assessing deviance reduction to measure each feature’s contribution. Finally, I analyzed correlation matrices to ensure multicollinearity would not hinder the model’s effectiveness.
First, I split the cleaned dataset (df_cleaned) into a
training set and a test set in a 9:1 ratio. This ensured that there was
sufficient data for model training, while reserving some for model
evaluation. Then, I further split the training set into a calibration
set and an actual training set.
# Data Splitting
# Set random seed for reproducibility
set.seed(729375)
# Assign each row a uniform draw, then split 90:10 into training and test sets
# (nrow() replaces the less idiomatic dim(x)[1]; same value, same RNG stream)
df_cleaned$rgroup <- runif(nrow(df_cleaned))
dTrainAll <- subset(df_cleaned, rgroup <= 0.9)
dTest <- subset(df_cleaned, rgroup > 0.9)
# Carve a ~10% calibration set out of the training rows
useForCal <- rbinom(n = nrow(dTrainAll), size = 1, prob = 0.1) > 0
dCal <- subset(dTrainAll, useForCal)
dTrain <- subset(dTrainAll, !useForCal)
# Outcome variable and candidate feature columns
outcome <- 'death_risk_category'
# NOTE(review): positional selection assumes the first 28 columns are exactly
# the health factors; selecting by name (e.g. intersect with health_factors)
# would be safer if the column order ever changes.
features <- colnames(df_cleaned)[1:28] # First 28 columns are health factors
By setting a random seed, I ensured that the results would be reproducible, allowing others to verify the results in future iterations or studies.
I used the AUC metric to evaluate the performance of each feature. AUC measures the feature’s ability to distinguish between categories of the outcome variable. The higher the AUC score, the better the feature is at distinguishing between categories. Besides, species distribution models are usually evaluated with cross-validation (Hijmans, 2012), so to ensure the robustness of the AUC scores, I performed 10-fold cross-validation to calculate the mean AUC for each feature.
calcAUC <- function(predcol, outcol) {
  # ROCR-based AUC; the positive class is outcol == 1
  rocr_pred <- prediction(predcol, outcol == 1)
  auc_perf <- performance(rocr_pred, 'auc')
  as.numeric(auc_perf@y.values)
}
# K-fold cross-validation to calculate the mean AUC of a single-feature model.
# The previous version claimed 10-fold CV but actually drew an independent
# Bernoulli(0.1) holdout on every iteration, so observations could appear in
# several (or no) validation sets and a "fold" could even be empty. This
# version partitions the rows into num_folds disjoint folds of near-equal size.
mean_auc <- function(var_col, outcome_col, data, num_folds = 10) {
  # Random permutation of fold labels 1..num_folds over the rows
  fold_id <- sample(rep(seq_len(num_folds), length.out = nrow(data)))
  # Build the one-predictor formula once; it does not change per fold
  model_formula <- as.formula(paste(outcome_col, "~", var_col))
  aucs <- vapply(seq_len(num_folds), function(k) {
    train_fold <- data[fold_id != k, , drop = FALSE]
    cal_fold <- data[fold_id == k, , drop = FALSE]
    # Fit a logistic model on the training fold, score the held-out fold
    fit <- glm(model_formula, data = train_fold, family = binomial(link = 'logit'))
    pred_values <- predict(fit, newdata = cal_fold, type = 'response')
    calcAUC(pred_values, cal_fold[[outcome_col]])
  }, numeric(1))
  mean(aucs)
}
To further evaluate the contribution of each feature to the model, I computed the deviance reduction for each feature. Deviance reduction measures how much adding a particular feature improves the model’s log-likelihood. The larger the deviance reduction, the greater the improvement the feature brings to the model.
# Bernoulli log-likelihood of predictions ypred given 0/1 labels ytrue;
# epsilon guards against log(0). NA terms are dropped from the sum.
logLikelihood <- function(ytrue, ypred, epsilon=1e-6) {
  per_obs <- ifelse(ytrue == 1, ypred + epsilon, 1 - ypred + epsilon)
  sum(log(per_obs), na.rm = TRUE)
}
# Deviance reduction of a one-predictor logistic model relative to the null
# model log-likelihood logNull: 2 * (logLik(model) - logNull). Larger is better.
deviance_reduction <- function(var_col, outcome_col, train_data, logNull) {
  # Dynamically build "outcome ~ feature" and fit the logistic model
  model_formula <- as.formula(paste(outcome_col, "~", var_col))
  fit <- glm(model_formula, data = train_data, family = binomial(link = 'logit'))
  # In-sample predicted probabilities
  fitted_probs <- predict(fit, newdata = train_data, type = 'response')
  model_loglik <- logLikelihood(train_data[[outcome_col]], fitted_probs)
  2 * (model_loglik - logNull)
}
Based on the AUC, mean AUC from cross-validation, and deviance reduction scores, I selected the top-performing features to form two combinations (Combination 1 and Combination 2).
# Score every candidate feature by raw AUC, cross-validated mean AUC, and
# deviance reduction against the null (base-rate) model.
logNull <- logLikelihood(dTrain[[outcome]], mean(dTrain[[outcome]]))
for (feature in features) {
  print(paste("Feature:", feature))
  # The three criteria are independent; only mean_auc consumes random numbers
  feat_auc <- calcAUC(dTrain[[feature]], dTrain[[outcome]])
  cv_auc <- mean_auc(feature, outcome, dTrain)
  dev_red <- deviance_reduction(feature, outcome, dTrain, logNull)
  print(paste("AUC:", feat_auc, "Mean AUC:", cv_auc, "Deviance Reduction:", dev_red))
}
## [1] "Feature: Outdoor.air.pollution"
## [1] "AUC: 0.548282906306779 Mean AUC: 0.548493414646486 Deviance Reduction: 154.06325261823"
## [1] "Feature: High.systolic.blood.pressure"
## [1] "AUC: 0.598024455650449 Mean AUC: 0.607596379334943 Deviance Reduction: 450.622484259349"
## [1] "Feature: Diet.high.in.sodium"
## [1] "AUC: 0.56416766257615 Mean AUC: 0.565009760475735 Deviance Reduction: 232.491078882417"
## [1] "Feature: Diet.low.in.whole.grains"
## [1] "AUC: 0.512910144674066 Mean AUC: 0.512031094157497 Deviance Reduction: 189.824076173522"
## [1] "Feature: Alochol.use"
## [1] "AUC: 0.756521020977254 Mean AUC: 0.761606035707882 Deviance Reduction: 963.300965892275"
## [1] "Feature: Diet.low.in.fruits"
## [1] "AUC: 0.638037183859467 Mean AUC: 0.643797683816133 Deviance Reduction: 309.23245830287"
## [1] "Feature: Unsafe.water.source"
## [1] "AUC: 0.583933793880742 Mean AUC: 0.595216629656728 Deviance Reduction: 778.983451528766"
## [1] "Feature: Secondhand.smoke"
## [1] "AUC: 0.568064826420264 Mean AUC: 0.571056455919691 Deviance Reduction: 104.371221671013"
## [1] "Feature: Low.birth.weight"
## [1] "AUC: 0.577370993286111 Mean AUC: 0.58428664344593 Deviance Reduction: 513.180800282537"
## [1] "Feature: Child.wasting"
## [1] "AUC: 0.592386636352154 Mean AUC: 0.591986782548388 Deviance Reduction: 804.614544613633"
## [1] "Feature: Unsafe.sex"
## [1] "AUC: 0.701542930985899 Mean AUC: 0.720459812998607 Deviance Reduction: 572.707317673186"
## [1] "Feature: Diet.low.in.nuts.and.seeds"
## [1] "AUC: 0.552421517474568 Mean AUC: 0.568728712626263 Deviance Reduction: 259.460609224632"
## [1] "Feature: Household.air.pollution.from.solid.fuels"
## [1] "AUC: 0.662447046764022 Mean AUC: 0.663103663363888 Deviance Reduction: 503.451192595847"
## [1] "Feature: Diet.low.in.Vegetables"
## [1] "AUC: 0.599697583583525 Mean AUC: 0.598992879587325 Deviance Reduction: 138.259650645369"
## [1] "Feature: Low.physical.activity"
## [1] "AUC: 0.509506077476901 Mean AUC: 0.523102460076377 Deviance Reduction: 48.4663133421836"
## [1] "Feature: Smoking"
## [1] "AUC: 0.582497983890557 Mean AUC: 0.60042016698119 Deviance Reduction: 346.549184337729"
## [1] "Feature: High.fasting.plasma.glucose"
## [1] "AUC: 0.500616492261054 Mean AUC: 0.501246253591901 Deviance Reduction: 38.3929413056239"
## [1] "Feature: Air.pollution"
## [1] "AUC: 0.756595835640929 Mean AUC: 0.769405641648051 Deviance Reduction: 923.535332524751"
## [1] "Feature: High.body.mass.index"
## [1] "AUC: 0.498907657329409 Mean AUC: 0.514262606515275 Deviance Reduction: 84.6768563338819"
## [1] "Feature: Unsafe.sanitation"
## [1] "AUC: 0.601900972590627 Mean AUC: 0.593968614865275 Deviance Reduction: 799.654615720418"
## [1] "Feature: No.access.to.handwashing.facility"
## [1] "AUC: 0.606775463705174 Mean AUC: 0.624266559856096 Deviance Reduction: 918.664369167288"
## [1] "Feature: Drug.use"
## [1] "AUC: 0.600867412870067 Mean AUC: 0.602400899708192 Deviance Reduction: 199.918118136354"
## [1] "Feature: Low.bone.mineral.density"
## [1] "AUC: 0.64762414861884 Mean AUC: 0.649275925718364 Deviance Reduction: 295.827858937868"
## [1] "Feature: Vitamin.A.deficiency"
## [1] "AUC: 0.612153617823381 Mean AUC: 0.614426968970445 Deviance Reduction: 837.840116627001"
## [1] "Feature: Child.stunting"
## [1] "AUC: 0.6126971172064 Mean AUC: 0.619724669804491 Deviance Reduction: 351.130349999051"
## [1] "Feature: Discontinued.breastfeeding"
## [1] "AUC: 0.564174342456834 Mean AUC: 0.576079065629465 Deviance Reduction: 49.947179168179"
## [1] "Feature: Non.exclusive.breastfeeding"
## [1] "AUC: 0.604453415726625 Mean AUC: 0.606894901782264 Deviance Reduction: 783.260009285834"
## [1] "Feature: Iron.deficiency"
## [1] "AUC: 0.572750701994735 Mean AUC: 0.56417643555452 Deviance Reduction: 23.6165521611456"
Based on the performance of mean AUC, I selected the following eight features for Combination 1:
# Extract features for combination 1 (ranked by cross-validated mean AUC)
combination1_features <- c("Air.pollution", "Alochol.use", "Unsafe.sex",
                           "Household.air.pollution.from.solid.fuels",
                           "Diet.low.in.fruits", "Low.bone.mineral.density",
                           "Vitamin.A.deficiency", "Child.stunting")
# Combination 1 dataset: the selected features plus the target label
combination1 <- subset(df_cleaned, select = c(combination1_features, "death_risk_category"))
# Inspect the first rows of the combination 1 dataset
head(combination1)
## # A tibble: 6 × 9
## Air.pollution Alochol.use Unsafe.sex Household.air.pollut…¹ Diet.low.in.fruits
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.48 0.0333 0.0328 3.21 0.298
## 2 3.57 0.0339 0.0336 3.29 0.302
## 3 3.41 0.0312 0.0313 3.16 0.278
## 4 3.18 0.0278 0.0282 2.94 0.249
## 5 3.02 0.0258 0.0265 2.79 0.234
## 6 2.90 0.0247 0.0257 2.68 0.226
## # ℹ abbreviated name: ¹Household.air.pollution.from.solid.fuels
## # ℹ 4 more variables: Low.bone.mineral.density <dbl>,
## # Vitamin.A.deficiency <dbl>, Child.stunting <dbl>, death_risk_category <dbl>
And based on deviance reduction performance, I selected the following eight features for Combination 2:
# Extract features for combination 2 (ranked by deviance reduction)
combination2_features <- c("Alochol.use", "Air.pollution",
                           "No.access.to.handwashing.facility", "Unsafe.water.source",
                           "Child.wasting", "Unsafe.sanitation",
                           "Vitamin.A.deficiency", "Unsafe.sex")
# Combination 2 dataset: the selected features plus the target label
combination2 <- subset(df_cleaned, select = c(combination2_features, "death_risk_category"))
# Inspect the first rows of the combination 2 dataset
head(combination2)
## # A tibble: 6 × 9
## Alochol.use Air.pollution No.access.to.handwashing.facil…¹ Unsafe.water.source
## <dbl> <dbl> <dbl> <dbl>
## 1 0.0333 3.48 0.451 0.346
## 2 0.0339 3.57 0.477 0.401
## 3 0.0312 3.41 0.488 0.444
## 4 0.0278 3.18 0.500 0.511
## 5 0.0258 3.02 0.480 0.465
## 6 0.0247 2.90 0.481 0.510
## # ℹ abbreviated name: ¹No.access.to.handwashing.facility
## # ℹ 5 more variables: Child.wasting <dbl>, Unsafe.sanitation <dbl>,
## # Vitamin.A.deficiency <dbl>, Unsafe.sex <dbl>, death_risk_category <dbl>
After selecting the features, I calculated the correlation matrix between them to ensure that multicollinearity between the selected features would not negatively impact model performance.
# Pairwise correlations among the combination 1 features (multicollinearity check)
correlation_matrix_comb1 <- combination1[combination1_features] %>% cor()
# Display the correlation matrix
print(correlation_matrix_comb1)
## Air.pollution Alochol.use Unsafe.sex
## Air.pollution 1.0000000 0.1172165 0.17564242
## Alochol.use 0.1172165 1.0000000 0.13934230
## Unsafe.sex 0.1756424 0.1393423 1.00000000
## Household.air.pollution.from.solid.fuels 0.8742670 -0.1429639 0.25179431
## Diet.low.in.fruits 0.3042034 0.5404508 -0.06142868
## Low.bone.mineral.density -0.2094188 0.4304340 -0.13692051
## Vitamin.A.deficiency 0.6038015 -0.1403785 0.12751644
## Child.stunting 0.3494752 -0.1385953 0.08206892
## Household.air.pollution.from.solid.fuels
## Air.pollution 0.874267019
## Alochol.use -0.142963932
## Unsafe.sex 0.251794308
## Household.air.pollution.from.solid.fuels 1.000000000
## Diet.low.in.fruits 0.000030934
## Low.bone.mineral.density -0.319083020
## Vitamin.A.deficiency 0.727048208
## Child.stunting 0.467320420
## Diet.low.in.fruits
## Air.pollution 0.304203390
## Alochol.use 0.540450810
## Unsafe.sex -0.061428680
## Household.air.pollution.from.solid.fuels 0.000030934
## Diet.low.in.fruits 1.000000000
## Low.bone.mineral.density 0.161243525
## Vitamin.A.deficiency -0.113951300
## Child.stunting -0.014155128
## Low.bone.mineral.density
## Air.pollution -0.2094188
## Alochol.use 0.4304340
## Unsafe.sex -0.1369205
## Household.air.pollution.from.solid.fuels -0.3190830
## Diet.low.in.fruits 0.1612435
## Low.bone.mineral.density 1.0000000
## Vitamin.A.deficiency -0.1933151
## Child.stunting -0.2593263
## Vitamin.A.deficiency Child.stunting
## Air.pollution 0.6038015 0.34947524
## Alochol.use -0.1403785 -0.13859526
## Unsafe.sex 0.1275164 0.08206892
## Household.air.pollution.from.solid.fuels 0.7270482 0.46732042
## Diet.low.in.fruits -0.1139513 -0.01415513
## Low.bone.mineral.density -0.1933151 -0.25932625
## Vitamin.A.deficiency 1.0000000 0.65322333
## Child.stunting 0.6532233 1.00000000
# Pairwise correlations among the combination 2 features (multicollinearity check)
correlation_matrix_comb2 <- combination2[combination2_features] %>% cor()
# Display the correlation matrix
print(correlation_matrix_comb2)
## Alochol.use Air.pollution
## Alochol.use 1.0000000 0.1172165
## Air.pollution 0.1172165 1.0000000
## No.access.to.handwashing.facility -0.1061770 0.6707417
## Unsafe.water.source -0.1217273 0.6633634
## Child.wasting -0.1561581 0.6869346
## Unsafe.sanitation -0.1199397 0.6674219
## Vitamin.A.deficiency -0.1403785 0.6038015
## Unsafe.sex 0.1393423 0.1756424
## No.access.to.handwashing.facility
## Alochol.use -0.1061770
## Air.pollution 0.6707417
## No.access.to.handwashing.facility 1.0000000
## Unsafe.water.source 0.9837737
## Child.wasting 0.9662361
## Unsafe.sanitation 0.9841255
## Vitamin.A.deficiency 0.8685335
## Unsafe.sex 0.3277613
## Unsafe.water.source Child.wasting
## Alochol.use -0.1217273 -0.1561581
## Air.pollution 0.6633634 0.6869346
## No.access.to.handwashing.facility 0.9837737 0.9662361
## Unsafe.water.source 1.0000000 0.9553582
## Child.wasting 0.9553582 1.0000000
## Unsafe.sanitation 0.9991611 0.9582028
## Vitamin.A.deficiency 0.8739208 0.9155177
## Unsafe.sex 0.2859231 0.2033186
## Unsafe.sanitation Vitamin.A.deficiency
## Alochol.use -0.1199397 -0.1403785
## Air.pollution 0.6674219 0.6038015
## No.access.to.handwashing.facility 0.9841255 0.8685335
## Unsafe.water.source 0.9991611 0.8739208
## Child.wasting 0.9582028 0.9155177
## Unsafe.sanitation 1.0000000 0.8779585
## Vitamin.A.deficiency 0.8779585 1.0000000
## Unsafe.sex 0.2802144 0.1275164
## Unsafe.sex
## Alochol.use 0.1393423
## Air.pollution 0.1756424
## No.access.to.handwashing.facility 0.3277613
## Unsafe.water.source 0.2859231
## Child.wasting 0.2033186
## Unsafe.sanitation 0.2802144
## Vitamin.A.deficiency 0.1275164
## Unsafe.sex 1.0000000
In selecting features for Combination 1, I excluded features like
Household.air.pollution.from.solid.fuels,
Diet.low.in.fruits, Low.bone.mineral.density,
and Child.stunting. While these features performed
reasonably well in mean AUC, they showed high multicollinearity in the
correlation matrix, particularly
Household.air.pollution.from.solid.fuels, which had a
correlation of 0.874 with Air.pollution. This high
correlation suggests that these features provided redundant information,
potentially leading to overfitting. Additionally,
Diet.low.in.fruits showed a moderately high correlation
(0.540) with Alochol.use, and its AUC and deviance
reduction scores were not as strong, leading to its exclusion. Although
Low.bone.mineral.density and Child.stunting
had lower correlations with other features, their performance in mean
AUC was inferior to other features, leading to their exclusion.
In selecting features for Combination 2, I excluded features like
Unsafe.sanitation, Unsafe.water.source, and
Vitamin.A.deficiency, despite their strong performance in
deviance reduction. These features exhibited very high correlations with
each other, such as the correlation of 0.999 between
Unsafe.sanitation and Unsafe.water.source,
indicating that they provided nearly identical information. Retaining
such highly correlated features could lead to model overfitting and
reduced generalizability. Additionally, while
Vitamin.A.deficiency performed well in some analyses, its
high correlation with features like Child.wasting (0.915)
led to its exclusion to minimize multicollinearity.
As a result of these analyses, I finalized the following feature sets:

- Combination 1: Air.pollution, Alochol.use, Unsafe.sex, Vitamin.A.deficiency (plus the target death_risk_category).
- Combination 2: Alochol.use, Air.pollution, Child.wasting, No.access.to.handwashing.facility (plus the target death_risk_category).
# Rebuild the two finalized feature subsets (target column included).
# The surrounding text had been fused onto this code line during extraction;
# the code itself is unchanged.
#   Combination 1: Air.pollution, Alochol.use, Unsafe.sex, Vitamin.A.deficiency
#   Combination 2: Alochol.use, Air.pollution, Child.wasting, No.access.to.handwashing.facility
combination1 <- df_cleaned[, c("Air.pollution", "Alochol.use", "Unsafe.sex", "Vitamin.A.deficiency", "death_risk_category")]
combination2 <- df_cleaned[, c("Alochol.use", "Air.pollution", "Child.wasting", "No.access.to.handwashing.facility", "death_risk_category")]
I used decision trees and logistic regression as classifiers to train models on the two selected feature sets (Combination 1 and Combination 2). These methods were chosen based on the nature of the dataset, which includes a binary label, “death rate” (0 for low risk, 1 for high risk), and 28 numerical health factors. Decision trees excel at segmenting data and identifying complex patterns, providing interpretable insights into how various health factors influence mortality risk. Logistic regression, as a classic binary classification model, establishes a clear linear relationship between predictors and outcomes while offering probabilistic outputs that aid decision-making. Both classifiers effectively handle the numerical data, balancing complexity with interpretability to ensure reliable predictions.
First, I split each combination’s dataset into training and test sets in a 9:1 ratio. Additionally, the training set was further divided into training and calibration sets using the same 9:1 ratio. This ensures that enough data is available for training, calibration, and testing of the models.
# Reproducible split: same seed as the feature-selection stage
set.seed(729375)
# Combination 1: 90/10 train-all/test split via a uniform draw per row
combination1$rgroup <- runif(nrow(combination1))
comb1_train_all <- subset(combination1, rgroup <= 0.9)
comb1_test <- subset(combination1, rgroup > 0.9)
# Carve a ~10% calibration subset out of the training rows
useForCal_comb1 <- rbinom(n = nrow(comb1_train_all), size = 1, prob = 0.1) > 0
comb1_cal <- subset(comb1_train_all, useForCal_comb1)
comb1_train <- subset(comb1_train_all, !useForCal_comb1)
# Combination 2: identical procedure. The RNG draws continue the same stream,
# so the two splits differ row-by-row even though only one seed is set.
combination2$rgroup <- runif(nrow(combination2))
comb2_train_all <- subset(combination2, rgroup <= 0.9)
comb2_test <- subset(combination2, rgroup > 0.9)
useForCal_comb2 <- rbinom(n = nrow(comb2_train_all), size = 1, prob = 0.1) > 0
comb2_cal <- subset(comb2_train_all, useForCal_comb2)
comb2_train <- subset(comb2_train_all, !useForCal_comb2)
To streamline the evaluation process, I defined several helper functions to calculate AUC, Precision, Recall, F1 score, and Deviance Normalization. Additionally, a function for plotting ROC curves was implemented to visualize model performance. These helper functions ensure consistency in evaluation across different models, facilitating comparison.
# AUC of a score vector against 0/1 labels, computed with ROCR.
calcAUC <- function(predcol, outcol) {
  auc_obj <- performance(prediction(predcol, outcol == 1), 'auc')
  as.numeric(auc_obj@y.values)
}
# Bernoulli log-likelihood of predictions ypred given 0/1 labels ytrue;
# epsilon guards against log(0) and NA terms are dropped.
# Uses ytrue == 1 (instead of relying on bare-numeric truthiness) for
# consistency with the earlier definition of this function in the file.
logLikelihood <- function(ytrue, ypred, epsilon=1e-6) {
  sum(ifelse(ytrue == 1, log(ypred + epsilon), log(1 - ypred + epsilon)), na.rm = TRUE)
}
# Precision / recall / F1 / normalized deviance of predictions ypred against
# 0/1 labels ytrue, thresholded at `threshold`. Returns a one-row data.frame.
performanceMeasures <- function(ytrue, ypred, model.name = "model", threshold=0.5) {
  # Normalized deviance: -2 * logLik / n (smaller is better-calibrated)
  dev.norm <- -2 * logLikelihood(ytrue, ypred) / length(ypred)
  # Force both margins to carry both levels so the confusion matrix is always
  # 2x2; previously, a degenerate prediction vector (all one side of the
  # threshold) produced a 2x1 table and cmat[2, 2] raised a subscript error.
  cmat <- table(actual = factor(ytrue == 1, levels = c(FALSE, TRUE)),
                predicted = factor(ypred >= threshold, levels = c(FALSE, TRUE)))
  # (An `accuracy` value was computed here before but never returned; the dead
  # local has been removed to keep the output columns unchanged.)
  precision <- cmat[2, 2] / sum(cmat[, 2])
  recall <- cmat[2, 2] / sum(cmat[2, ])
  f1 <- 2 * precision * recall / (precision + recall)
  data.frame(model = model.name, precision = precision, recall = recall, f1 = f1, dev.norm = dev.norm)
}
# Configure pander for plain-ASCII, simple-style tables that keep trailing zeros.
# Side effects only: mutates pander's global options.
panderOpt <- function(){
  panderOptions("plain.ascii", TRUE)
  panderOptions("keep.trailing.zeros", TRUE)
  panderOptions("table.style", "simple")
}
# Render a two-row (training / test) performance table for `model` with pander.
# xtrain/xtest are feature data frames; ytrain/ytest are the 0/1 labels.
pretty_perf_table <- function(model, xtrain, ytrain, xtest, ytest, threshold=0.5) {
  panderOpt()
  # Left-align the model name, right-align the four numeric metric columns
  perf_justify <- "lrrrr"
  # NOTE(review): predict() is called without type=; this yields a
  # probability-like score for the rpart trees used in this report, but would
  # return link-scale values for a glm -- confirm before reusing with other models.
  pred_train <- predict(model, newdata = xtrain)
  pred_test <- predict(model, newdata = xtest)
  trainperf_df <- performanceMeasures(ytrain, pred_train, model.name = "training", threshold = threshold)
  testperf_df <- performanceMeasures(ytest, pred_test, model.name = "test", threshold = threshold)
  perftable <- rbind(trainperf_df, testperf_df)
  pandoc.table(perftable, justify = perf_justify)
}
# Overlay two ROC curves (via ROCit): curve 1 in blue, curve 2 in red, with the
# chance diagonal from the first plot serving as the "Null Model" line.
plot_roc <- function(predcol1, outcol1, predcol2, outcol2) {
  roc_1 <- rocit(score = predcol1, class = outcol1 == 1)
  roc_2 <- rocit(score = predcol2, class = outcol2 == 1)
  plot(roc_1, col = c("blue", "green"), lwd = 3, legend = FALSE, YIndex = FALSE, values = TRUE, asp = 1)
  lines(roc_2$TPR ~ roc_2$FPR, lwd = 3, col = c("red", "green"), asp = 1)
  # NOTE(review): the legend hard-codes "Test Data" for curve 1 and
  # "Training Data" for curve 2 -- callers must pass test predictions first.
  legend("bottomright", col = c("blue", "red", "green"), c("Test Data", "Training Data", "Null Model"), lwd = 2)
}
Using the rpart package, I trained decision tree models
on both Combination 1 and Combination
2, predicting the death risk category based on the selected
features.
The decision tree (DT)-based model might be regarded as an interpretable model as its functioning can be easily understood upon inspecting the arrangement of the tree (Sahoo et al., 2023).
# Build "target ~ f1 + f2 + ..." formula strings, then fit one tree per feature set.
comb1_terms <- c("Air.pollution", "Alochol.use", "Unsafe.sex", "Vitamin.A.deficiency")
fV1 <- paste("death_risk_category", "~", paste(comb1_terms, collapse = " + "))
comb1_tree_model <- rpart(fV1, data = comb1_train)
comb2_terms <- c("Alochol.use", "Air.pollution", "Child.wasting", "No.access.to.handwashing.facility")
fV2 <- paste("death_risk_category", "~", paste(comb2_terms, collapse = " + "))
comb2_tree_model <- rpart(fV2, data = comb2_train)
I evaluated the decision tree models based on Precision, Recall, F1 score, and AUC, providing performance metrics for both training and test datasets. Additionally, ROC curves were plotted to visualize model performance.
# Score the fitted Combination 1 tree on the training, test, and calibration
# splits. type = "vector" returns each leaf's fitted mean of the numeric 0/1
# target, which serves as a probability-like score for AUC and thresholding.
train_pred_comb1_tree <- predict(comb1_tree_model, newdata = comb1_train, type = "vector")
test_pred_comb1_tree <- predict(comb1_tree_model, newdata = comb1_test, type = "vector")
cal_pred_comb1_tree <- predict(comb1_tree_model, newdata = comb1_cal, type = "vector")
# AUC on each split; similar values across splits indicate little overfitting
print(paste("Training AUC for Decision Tree - Combination 1: ", calcAUC(train_pred_comb1_tree, comb1_train$death_risk_category)))
## [1] "Training AUC for Decision Tree - Combination 1: 0.917988311423325"
print(paste("Test AUC for Decision Tree - Combination 1: ", calcAUC(test_pred_comb1_tree, comb1_test$death_risk_category)))
## [1] "Test AUC for Decision Tree - Combination 1: 0.911910578133096"
print(paste("Calibration AUC for Decision Tree - Combination 1: ", calcAUC(cal_pred_comb1_tree, comb1_cal$death_risk_category)))
## [1] "Calibration AUC for Decision Tree - Combination 1: 0.916480140443274"
# Precision / recall / F1 / normalized deviance at the default 0.5 threshold
train_perf_comb1_tree <- performanceMeasures(comb1_train$death_risk_category, train_pred_comb1_tree)
test_perf_comb1_tree <- performanceMeasures(comb1_test$death_risk_category, test_pred_comb1_tree)
print("Train Performance for Decision Tree - Combination 1:")
## [1] "Train Performance for Decision Tree - Combination 1:"
print(train_perf_comb1_tree)
## model precision recall f1 dev.norm
## 1 model 0.826395 0.9201183 0.870742 0.6869321
print("Test Performance for Decision Tree - Combination 1:")
## [1] "Test Performance for Decision Tree - Combination 1:"
print(test_perf_comb1_tree)
## model precision recall f1 dev.norm
## 1 model 0.8502024 0.8860759 0.8677686 0.7128738
# Visualize the fitted tree structure
rpart.plot(comb1_tree_model)
From this decision tree, it can be observed that Air pollution and Alcohol use are the primary factors in distinguishing between high-risk and low-risk categories. By combining these features, the model progressively narrows down the classification, allowing for effective prediction of mortality risk categories across different countries.
# Long-format table of the Combination 1 decision-tree metrics, one row per
# (Dataset, Metric) pair, ready for a dodged bar chart.
metrics_dt1 <- c("Dev Norm", "F1", "Precision", "Recall", "AUC")
train_vals_dt1 <- c(0.6869, 0.8707, 0.8264, 0.9201, 0.9179) # Training values
test_vals_dt1 <- c(0.7129, 0.8678, 0.8502, 0.8861, 0.9119)  # Test values
data_comb1 <- data.frame(
  Dataset = rep(c("Training", "Test"), each = length(metrics_dt1)),
  Metric = rep(metrics_dt1, 2),
  Value = c(train_vals_dt1, test_vals_dt1)
)
# Dodged bar chart comparing train vs test metrics for Combination 1.
fill_colours_dt1 <- c("Dev Norm" = "orange", "F1" = "blue", "Precision" = "purple",
                      "Recall" = "yellow", "AUC" = "red")
p_dt1_bar <- ggplot(data_comb1, aes(x = Dataset, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Performance Metrics for Combination 1",
       y = "Value",
       x = "Dataset") +
  theme_minimal() +
  scale_fill_manual(values = fill_colours_dt1)
print(p_dt1_bar)
From the chart, it is evident that the model performs consistently on both the test and training datasets, showing good effectiveness in distinguishing between high-risk and low-risk categories. However, the Dev Norm metric indicates there is room for improvement in the model’s calibration.
plot_roc(test_pred_comb1_tree, comb1_test$death_risk_category, train_pred_comb1_tree, comb1_train$death_risk_category)
This ROC (receiver operating characteristic) curve compares the performance of the decision tree model on the test set and the training set. As can be seen from the figure, the ROC curves of the test set (blue line) and the training set (red line) are very close, which shows that the model performs relatively consistently on both sets, without obvious overfitting or underfitting problems. Both curves are much higher than the dotted random classifier (green line), indicating that the model has good discrimination ability in distinguishing high-risk and low-risk categories, and the AUC (area under the curve) is close to 1, indicating that the overall performance of the model is good.
# Score the Combination 2 decision tree on the train/test/calibration splits
# (same evaluation pipeline as Combination 1 above).
train_pred_comb2_tree <- predict(comb2_tree_model, newdata = comb2_train, type = "vector")
test_pred_comb2_tree <- predict(comb2_tree_model, newdata = comb2_test, type = "vector")
cal_pred_comb2_tree <- predict(comb2_tree_model, newdata = comb2_cal, type = "vector")
# AUC on all three splits.
print(paste("Training AUC for Decision Tree - Combination 2: ", calcAUC(train_pred_comb2_tree, comb2_train$death_risk_category)))
## [1] "Training AUC for Decision Tree - Combination 2: 0.913419778893802"
print(paste("Test AUC for Decision Tree - Combination 2: ", calcAUC(test_pred_comb2_tree, comb2_test$death_risk_category)))
## [1] "Test AUC for Decision Tree - Combination 2: 0.910354738313008"
print(paste("Calibration AUC for Decision Tree - Combination 2: ", calcAUC(cal_pred_comb2_tree, comb2_cal$death_risk_category)))
## [1] "Calibration AUC for Decision Tree - Combination 2: 0.910339050948687"
# Precision / recall / F1 / normalized deviance for train and test.
train_perf_comb2_tree <- performanceMeasures(comb2_train$death_risk_category, train_pred_comb2_tree)
test_perf_comb2_tree <- performanceMeasures(comb2_test$death_risk_category, test_pred_comb2_tree)
print("Train Performance for Decision Tree - Combination 2:")
## [1] "Train Performance for Decision Tree - Combination 2:"
print(train_perf_comb2_tree)
## model precision recall f1 dev.norm
## 1 model 0.9186813 0.8389363 0.8769997 0.6650976
print("Test Performance for Decision Tree - Combination 2:")
## [1] "Test Performance for Decision Tree - Combination 2:"
print(test_perf_comb2_tree)
## model precision recall f1 dev.norm
## 1 model 0.9363636 0.8046875 0.8655462 0.6751319
# Visualize the fitted tree structure.
rpart.plot(comb2_tree_model)
This decision tree shows that the lack of access to handwashing facilities and alcohol use have a significant impact on mortality risk, followed by health factors such as child malnutrition.
# Long-format table of the Combination 2 decision-tree metrics for plotting.
metrics_dt2 <- c("Dev Norm", "F1", "Precision", "Recall", "AUC")
train_vals_dt2 <- c(0.6651, 0.8770, 0.9187, 0.8389, 0.9134) # Training values
test_vals_dt2 <- c(0.6751, 0.8655, 0.9364, 0.8047, 0.9103)  # Test values
data_comb2 <- data.frame(
  Dataset = rep(c("Training", "Test"), each = length(metrics_dt2)),
  Metric = rep(metrics_dt2, 2),
  Value = c(train_vals_dt2, test_vals_dt2)
)
# Dodged bar chart comparing train vs test metrics for Combination 2.
fill_colours_dt2 <- c("Dev Norm" = "orange", "F1" = "blue", "Precision" = "purple",
                      "Recall" = "yellow", "AUC" = "red")
p_dt2_bar <- ggplot(data_comb2, aes(x = Dataset, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Performance Metrics for Combination 2",
       y = "Value",
       x = "Dataset") +
  theme_minimal() +
  scale_fill_manual(values = fill_colours_dt2)
print(p_dt2_bar)
As can be seen from the figure, the Combination 2 model of the decision tree performs relatively consistently on the test set and the training set, and shows relatively balanced results in various performance indicators. In addition, the AUC value remains high, indicating that the model has good performance in distinguishing high-risk and low-risk categories.
plot_roc(test_pred_comb2_tree, comb2_test$death_risk_category, train_pred_comb2_tree, comb2_train$death_risk_category)
The ROC curves of the decision tree combination 1 model and the decision tree combination 2 model on the test set and the training set are similar. However, the curve of the decision tree combination 2 is smoother, especially at a low false positive rate (FPR), indicating that the decision tree combination 2 may have an advantage in processing continuous variables.
ChatGPT helped me generate the initial code for ROC curves and bar charts. I made custom modifications, including adjusting colors, labels, and legends. I also plotted the ROC curves of the training set and the test set in the same chart to intuitively show the performance of the model on different data sets.
The comparison between the performance of Combination 1 and Combination 2 highlights the impact of feature selection on model effectiveness. Although both models use decision trees for modeling, the different feature selections lead to better ROC curve performance for Combination 2 compared to Combination 1. This demonstrates that proper feature selection can significantly enhance a model’s predictive power and accuracy, while inappropriate feature selection can limit the model’s performance.
Unlike traditional linear (ordinary least squares) regression, logistic regression is appropriate for modeling a binary outcome variable (Redirecting, 2024).
I trained the logistic regression model using the same feature combinations, ensuring consistency with the decision tree approach.
# Fit logistic regression models on the same two feature combinations used by
# the decision trees. The formula strings are built from the feature vectors;
# "Alochol.use" is the dataset's (misspelled) column name and must stay as-is.
comb1_features <- c("Air.pollution", "Alochol.use", "Unsafe.sex", "Vitamin.A.deficiency")
fL1 <- paste("death_risk_category ~", paste(comb1_features, collapse = " + "))
gmodel_comb1 <- glm(fL1, data = comb1_train, family = binomial(link = "logit"))
comb2_features <- c("Alochol.use", "Air.pollution", "Child.wasting", "No.access.to.handwashing.facility")
fL2 <- paste("death_risk_category ~", paste(comb2_features, collapse = " + "))
gmodel_comb2 <- glm(fL2, data = comb2_train, family = binomial(link = "logit"))
Similar to the decision tree, I evaluated the logistic regression model based on Precision, Recall, F1 score, and AUC, with performance metrics presented for both training and test datasets.
# Score the Combination 1 logistic regression on all three splits.
# type = "response" gives predicted probabilities from the binomial glm.
train_pred_comb1 <- predict(gmodel_comb1, newdata=comb1_train, type="response")
test_pred_comb1 <- predict(gmodel_comb1, newdata=comb1_test, type="response")
cal_pred_comb1 <- predict(gmodel_comb1, newdata=comb1_cal, type="response")
# AUC on all three splits.
print(paste("Training AUC for Combination 1: ", calcAUC(train_pred_comb1, comb1_train$death_risk_category)))
## [1] "Training AUC for Combination 1: 0.91440182275726"
print(paste("Test AUC for Combination 1: ", calcAUC(test_pred_comb1, comb1_test$death_risk_category)))
## [1] "Test AUC for Combination 1: 0.918609815678437"
print(paste("Calibration AUC for Combination 1: ", calcAUC(cal_pred_comb1, comb1_cal$death_risk_category)))
## [1] "Calibration AUC for Combination 1: 0.921417599297784"
# Precision / recall / F1 / normalized deviance for train and test.
train_perf_comb1 <- performanceMeasures(comb1_train$death_risk_category, train_pred_comb1)
test_perf_comb1 <- performanceMeasures(comb1_test$death_risk_category, test_pred_comb1)
print("Train Performance for Combination 1:")
## [1] "Train Performance for Combination 1:"
print(train_perf_comb1)
## model precision recall f1 dev.norm
## 1 model 0.862346 0.7938856 0.8267009 0.7430009
print("Test Performance for Combination 1:")
## [1] "Test Performance for Combination 1:"
print(test_perf_comb1)
## model precision recall f1 dev.norm
## 1 model 0.9095477 0.7637131 0.8302752 0.7207086
# Long-format table of the Combination 1 logistic-regression metrics.
metrics_lr1 <- c("Dev Norm", "F1", "Precision", "Recall", "AUC")
train_vals_lr1 <- c(0.7430, 0.8267, 0.8623, 0.7938, 0.9144) # Training values
test_vals_lr1 <- c(0.7207, 0.8302, 0.9095, 0.7637, 0.9186)  # Test values
data_comb1_log <- data.frame(
  Dataset = rep(c("Training", "Test"), each = length(metrics_lr1)),
  Metric = rep(metrics_lr1, 2),
  Value = c(train_vals_lr1, test_vals_lr1)
)
# Dodged bar chart of logistic-regression metrics, Combination 1.
# (Unlike the tree charts, the original applies no theme_minimal here.)
fill_colours_lr1 <- c("Dev Norm" = "orange", "F1" = "blue", "Precision" = "purple",
                      "Recall" = "yellow", "AUC" = "red")
p_lr1_bar <- ggplot(data_comb1_log, aes(x = Dataset, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Logistic Regression Performance Metrics - Combination 1",
       y = "Value",
       x = "Dataset") +
  scale_fill_manual(values = fill_colours_lr1)
print(p_lr1_bar)
# Plot ROC curve for Combination 1
# Test-set curve vs training-set curve for the logistic regression.
plot_roc(test_pred_comb1, comb1_test$death_risk_category, train_pred_comb1, comb1_train$death_risk_category)
The graph suggests that the model performs well on the training set, but does not achieve the same level of performance on the test set. Although both the test (blue) and training (red) curves are significantly better than the random model (represented by the green dashed line), indicating good predictive power, the red training curve outperforms the blue test curve. This implies that the model may be slightly overfitting, as it fits the training data better than the test data.
# Score the Combination 2 logistic regression on all three splits
# (probabilities via type = "response").
train_pred_comb2 <- predict(gmodel_comb2, newdata=comb2_train, type="response")
test_pred_comb2 <- predict(gmodel_comb2, newdata=comb2_test, type="response")
cal_pred_comb2 <- predict(gmodel_comb2, newdata=comb2_cal, type="response")
# AUC on all three splits.
print(paste("Training AUC for Combination 2: ", calcAUC(train_pred_comb2, comb2_train$death_risk_category)))
## [1] "Training AUC for Combination 2: 0.906978174870858"
print(paste("Test AUC for Combination 2: ", calcAUC(test_pred_comb2, comb2_test$death_risk_category)))
## [1] "Test AUC for Combination 2: 0.923415269308945"
print(paste("Calibration AUC for Combination 2: ", calcAUC(cal_pred_comb2, comb2_cal$death_risk_category)))
## [1] "Calibration AUC for Combination 2: 0.916311591827816"
# Precision / recall / F1 / normalized deviance for train and test.
train_perf_comb2 <- performanceMeasures(comb2_train$death_risk_category, train_pred_comb2)
test_perf_comb2 <- performanceMeasures(comb2_test$death_risk_category, test_pred_comb2)
print("Train Performance for Combination 2:")
## [1] "Train Performance for Combination 2:"
print(train_perf_comb2)
## model precision recall f1 dev.norm
## 1 model 0.8633642 0.7957852 0.8281984 0.7781291
print("Test Performance for Combination 2:")
## [1] "Test Performance for Combination 2:"
print(test_perf_comb2)
## model precision recall f1 dev.norm
## 1 model 0.8793103 0.796875 0.8360656 0.7181489
# Long-format table of the Combination 2 logistic-regression metrics.
metrics_lr2 <- c("Dev Norm", "F1", "Precision", "Recall", "AUC")
train_vals_lr2 <- c(0.7781, 0.8281, 0.8633, 0.7957, 0.9069) # Training values
test_vals_lr2 <- c(0.7181, 0.8360, 0.8793, 0.7968, 0.9234)  # Test values
data_comb2_log <- data.frame(
  Dataset = rep(c("Training", "Test"), each = length(metrics_lr2)),
  Metric = rep(metrics_lr2, 2),
  Value = c(train_vals_lr2, test_vals_lr2)
)
# Dodged bar chart of logistic-regression metrics, Combination 2.
fill_colours_lr2 <- c("Dev Norm" = "orange", "F1" = "blue", "Precision" = "purple",
                      "Recall" = "yellow", "AUC" = "red")
p_lr2_bar <- ggplot(data_comb2_log, aes(x = Dataset, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Logistic Regression Performance Metrics - Combination 2",
       y = "Value",
       x = "Dataset") +
  scale_fill_manual(values = fill_colours_lr2)
print(p_lr2_bar)
# Plot ROC curve for Combination 2
# Test-set curve vs training-set curve for the logistic regression.
plot_roc(test_pred_comb2, comb2_test$death_risk_category, train_pred_comb2, comb2_train$death_risk_category)
The overall shape of the ROC curve for logistic regression with Combination 2 is similar to the curve for logistic regression with Combination 1. Although the model performs well during training, there is a slight decline in performance on the test data, indicating that there is still room for improvement in the model’s generalization ability.
# Rebuild the four metric tables and draw all four model/combination bar
# charts in one 2 x 2 grid. Two small helpers replace the four copy-pasted
# data.frame() calls and four near-identical ggplot chains of the original;
# the resulting objects (data_comb1, ..., p1, ..., p4) keep the same names
# and contents.

# make_metric_df: long-format table of the five evaluation metrics.
#   train_vals / test_vals: numeric vectors ordered as
#   Dev Norm, F1, Precision, Recall, AUC.
make_metric_df <- function(train_vals, test_vals) {
  metrics <- c("Dev Norm", "F1", "Precision", "Recall", "AUC")
  data.frame(
    Dataset = rep(c("Training", "Test"), each = length(metrics)),
    Metric = rep(metrics, 2),
    Value = c(train_vals, test_vals)
  )
}

# make_metric_plot: dodged bar chart of one metric table with the given title.
make_metric_plot <- function(df, title) {
  ggplot(df, aes(x = Dataset, y = Value, fill = Metric)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(title = title, y = "Value", x = "Dataset") +
    theme_minimal()
}

data_comb1 <- make_metric_df(c(0.6869, 0.8707, 0.8264, 0.9201, 0.9179),
                             c(0.7129, 0.8678, 0.8502, 0.8861, 0.9119))
data_comb2 <- make_metric_df(c(0.6651, 0.8770, 0.9187, 0.8389, 0.9134),
                             c(0.6751, 0.8655, 0.9364, 0.8047, 0.9103))
data_comb1_log <- make_metric_df(c(0.7430, 0.8267, 0.8623, 0.7938, 0.9144),
                                 c(0.7207, 0.8302, 0.9095, 0.7637, 0.9186))
data_comb2_log <- make_metric_df(c(0.7781, 0.8281, 0.8633, 0.7957, 0.9069),
                                 c(0.7181, 0.8360, 0.8793, 0.7968, 0.9234))

p1 <- make_metric_plot(data_comb1, "Decision Tree - Combination 1")
p2 <- make_metric_plot(data_comb2, "Decision Tree - Combination 2")
p3 <- make_metric_plot(data_comb1_log, "Logistic Regression - Combination 1")
p4 <- make_metric_plot(data_comb2_log, "Logistic Regression - Combination 2")
# 2 x 2 grid: decision trees on top, logistic regressions below.
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
After considering the performance of all four models, Combination 2 of the logistic regression model demonstrated the best overall performance. Firstly, it achieved the highest AUC score of 0.9234 on the test set, indicating exceptional accuracy in distinguishing between high-risk and low-risk groups. Since the AUC score approaches 1, this shows the logistic regression model’s robust ability to identify mortality risks effectively.
In terms of precision, recall, and F1 score, the logistic regression model with Combination 2 also excelled: it achieved a precision of 0.8793, showing its accuracy in predicting high-risk groups; a recall of 0.7969, demonstrating its effectiveness in capturing actual high-risk cases; and an F1 score of 0.8361, reflecting a well-balanced performance between precision and recall.
Additionally, the model’s Deviance Normalization score of 0.7181 suggests a good fit to the data, avoiding overfitting or underfitting. While the decision tree models for Combination 1 and Combination 2 also delivered solid AUC scores, they exhibited slightly more variability in F1 scores and precision, particularly in balancing recall. Given these considerations, the logistic regression model with Combination 2 emerged as the best choice, offering strong predictive power and stability across all performance metrics.
Although the Combination 2 logistic regression model achieved the best overall test performance, I selected the Combination 2 decision tree model for interpretation and decided to use Local Interpretable Model-Agnostic Explanations to further understand the contribution of each feature to specific predictions and verify whether these results are consistent with my hypothesis.
Local interpretable model-agnostic explanations (LIME) are used to show the explainability of the proposed method (Interpretable Ensemble Deep Learning Model for Early Detection of Alzheimer…: EBSCOhost, 2022).
Although the overall model performed well on the test set, I hope that LIME can explain the behavior of the model on individual predictions to help me understand which health factors play a key role in the classification process of countries with high mortality risk.
I chose to perform LIME explanation on the Combination 2 of the decision tree. Because the Combination 2 model performs better than other models on the test set, showing a high F1 score and good prediction accuracy, it is the most worthy model for in-depth analysis. In addition, LIME is a computationally intensive tool. Applying it to all models will consume a lot of time and resources. Selecting the best performing model for explanation can maximize the value of LIME and ensure that the explained results are representative.
# LIME needs a classification model: convert the target to a factor and refit
# the Combination 2 tree with method = "class" (the earlier evaluation used
# numeric "vector" predictions).
comb2_train$death_risk_category <- as.factor(comb2_train$death_risk_category)
comb2_test$death_risk_category <- as.factor(comb2_test$death_risk_category)
# Formula using every column except the target as a predictor.
fV2 <- paste("death_risk_category ~",
paste(names(comb2_train)[-which(names(comb2_train) == "death_risk_category")],
collapse = " + "))
comb2_tree_model <- rpart(fV2, data = comb2_train, method = "class")
# lime hook: declare rpart models as classification models so lime knows
# which explanation machinery to use.
model_type.rpart <- function(x, ...) {
  "classification"
}
# lime hook: return class probabilities from an rpart model as a data frame.
# Drops the target column if present so the label is never fed to the model.
# Fix vs original: subsetting with drop = FALSE, so a newdata with only two
# columns no longer collapses to a vector after the label is removed
# (setdiff() also handles the label-absent case without a branch).
predict_model.rpart <- function(model, newdata, ...) {
  keep <- setdiff(names(newdata), "death_risk_category")
  newdata <- newdata[, keep, drop = FALSE]
  pred <- predict(model, newdata, type = "prob")
  as.data.frame(pred)
}
# Build the LIME explainer from the training features (label column removed).
# bin_continuous = TRUE discretizes numeric features into bins for the local
# surrogate model.
explainer_comb2 <- lime(
x = comb2_train[, -which(names(comb2_train) == "death_risk_category")],
model = comb2_tree_model,
bin_continuous = TRUE
)
# Explain the first five test rows; report the 4 most influential features
# for label 1 (presumably the high-risk class — verify against the factor
# levels set earlier).
sample_comb2 <- comb2_test[1:5, -which(names(comb2_test) == "death_risk_category")]
explanations_comb2 <- explain(
sample_comb2,
explainer = explainer_comb2,
n_features = 4,
labels = 1
)
# Render one LIME feature-weight plot per explained test case, restyled with
# a centered bold title and a bottom legend.
for (case_id in unique(explanations_comb2$case)) {
  case_expl <- explanations_comb2[explanations_comb2$case == case_id, ]
  case_plot <- plot_features(case_expl) +
    labs(title = paste("LIME", case_id, " (Combination2)")) +
    scale_fill_manual(values = c("blue", "red")) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
      axis.text.y = element_text(size = 10, hjust = 1),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 12)
    )
  print(case_plot)
}
The LIME visualizations reveal that Alcohol Use and No Handwashing Access consistently contribute to high mortality risk predictions, underscoring the importance of personal behavior and sanitation in health outcomes. In contrast, the impact of Air Pollution varies, aligning with high-risk predictions in most cases but deviating in Case 5, reflecting the nuanced role of environmental factors. Overall, the model demonstrates stable predictive logic, suggesting that reducing alcohol consumption and enhancing sanitation facilities are essential strategies for lowering mortality risk.
I referred to the example code of ChatGPT’s LIME model to explain the predictions of the classification model. I selected representative samples from the test set, and modified the chart title and style to make the interpretation results more in line with the report requirements.
I conducted clustering analysis on a dataset with 28 health-related factors to explore patterns in population health across countries.
Before conducting the clustering analysis, I cleaned and standardized the data. Since all features represent ratios of health factors rather than absolute values, no additional conversions or cleaning were required. To place all 28 features on a common scale, I standardized them to have a mean of 0 and a standard deviation of 1.
# Drop the class label, then z-score every remaining health-factor column
# (mean 0, sd 1) so all features share a common scale for clustering.
feature_cols <- setdiff(names(df_cleaned), "death_risk_category")
df_clustering <- df_cleaned[, feature_cols]
df_scaled <- scale(df_clustering)
To determine the optimal number of clusters, I applied the Elbow Method, which calculates the within-cluster sum of squares (WSS) for varying cluster numbers and plots the results. By evaluating the improvement of WSS when one cluster is added, the potentially optimal k can be identified (Zhang et al., 2016). However, the plot did not reveal a clear elbow, making it challenging to select the ideal cluster count using this method alone.
# Total within-cluster sum of squares for k = 1..kmax K-means solutions.
#
# df_scaled: numeric matrix (or data frame) of standardized features.
# kmax:      largest number of clusters to evaluate.
# Returns a numeric vector wss, where wss[k] is tot.withinss of the k-cluster
# fit (nstart = 10 random restarts to stabilize the local optimum).
wss_total <- function(df_scaled, kmax){
  wss <- numeric(kmax)
  for (k in seq_len(kmax)) {  # seq_len: safe when kmax == 0, unlike 1:kmax
    # tot.withinss is already a scalar total; the original wrapped it in a
    # redundant sum().
    wss[k] <- kmeans(df_scaled, centers = k, nstart = 10)$tot.withinss
  }
  wss
}
kmax <- 15 # Set the maximum number of clusters to explore
wss <- wss_total(df_scaled, kmax)
# Prepare data for plotting
wss_data <- data.frame(Clusters = 1:kmax, WSS = wss)
# Create the Elbow Method plot with enhanced visualization
# NOTE(review): `size` for line thickness is deprecated since ggplot2 3.4 —
# prefer `linewidth` when the project's ggplot2 version allows it.
ggplot(wss_data, aes(x = Clusters, y = WSS)) +
geom_line(color = "#0E3FDF", size = 1.2) + # Blue line
geom_point(color = "#E61F00", size = 3) + # Red points (hex #E61F00, not orange)
scale_x_continuous(breaks = 1:kmax) + # Show all cluster numbers on the x-axis
theme_minimal() + # Clean, minimalistic theme
labs(title = "Elbow Method for Optimal Clusters",
subtitle = "Using Total Within-Clusters Sum of Squares (WSS)",
x = "Number of clusters K",
y = "Total within-clusters sum of squares (WSS)") +
theme(text = element_text(size = 14),
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5))
Since the Elbow Method did not yield a clear result, I applied the Silhouette Method to determine the optimal number of clusters. The Silhouette Method calculates the average silhouette score for different values of K. It considers the K value with maximum average silhouette score as the optimal number of clusters (Swetha Kariyavula & Kalaivani Anbarasan, 2023).
# Enhanced Silhouette Method visualization for determining the optimal number of clusters
library(ggplot2)
library(cluster)
# Average silhouette width for k = 2..kmax K-means solutions.
#
# df_scaled: numeric matrix (or data frame) of standardized features.
# kmax:      largest number of clusters to evaluate (must be >= 2; silhouette
#            is undefined for k = 1, so element 1 of the result stays 0).
# Returns numeric vector avg_silhouette, element k = mean silhouette width of
# the k-cluster kmeans fit.
silhouette_total <- function(df_scaled, kmax){
  avg_silhouette <- numeric(kmax)
  # Fix vs original: the pairwise distance matrix does not depend on k, so
  # compute it once instead of recomputing dist(df_scaled) every iteration
  # (an O(n^2) cost per loop pass).
  dmat <- dist(df_scaled)
  for (k in 2:kmax) {
    km_res <- kmeans(df_scaled, centers = k, nstart = 10)
    sil <- silhouette(km_res$cluster, dmat)
    avg_silhouette[k] <- mean(sil[, 3]) # column 3 = per-point silhouette width
  }
  avg_silhouette
}
kmax <- 15 # Set the maximum number of clusters to explore
avg_sil <- silhouette_total(df_scaled, kmax)
# Prepare data for plotting (element 1 is the unused k = 1 slot, so start at 2)
silhouette_data <- data.frame(Clusters = 2:kmax, AvgSilhouetteWidth = avg_sil[2:kmax])
# Create the Silhouette Method plot with enhanced visualization
# NOTE(review): `size` for line thickness is deprecated since ggplot2 3.4 —
# prefer `linewidth` when the project's ggplot2 version allows it.
ggplot(silhouette_data, aes(x = Clusters, y = AvgSilhouetteWidth)) +
geom_line(color = "#0E3FDF", size = 1.2) + # Blue line
geom_point(color = "#E61F00", size = 3) + # Red points (hex #E61F00, not orange)
scale_x_continuous(breaks = 2:kmax) + # Show all cluster numbers on the x-axis
theme_minimal() + # Clean, minimalistic theme
labs(title = "Silhouette Method for Optimal Clusters",
subtitle = "Average Silhouette Width Across Cluster Numbers",
x = "Number of clusters K",
y = "Average Silhouette Width") +
theme(text = element_text(size = 14),
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5))
By observing the silhouette width plot, I found that the average silhouette width peaked at 4 clusters, indicating that 4 clusters provide the best separation and cohesion in the data. Thus, I selected 4 as the optimal number of clusters.
After determining the optimal number of clusters, \(k=4\), I used the K-means clustering algorithm to group the data.
# Apply K-means clustering
set.seed(123) # fixed seed so the random restarts are reproducible
k <- 4 # Optimal number of clusters
km_result <- kmeans(df_scaled, centers=k, nstart=25)
To visualize the clustering results, I performed Principal Component Analysis (PCA) on the standardized data and extracted the first two principal components (PC1 and PC2) for visualization. This allows us to see the clustering distribution more clearly in two-dimensional space.
# Perform PCA and visualize clustering results
pca_result <- prcomp(df_scaled)
# First two principal components plus each row's cluster assignment.
pc_data <- data.frame(PC1 = pca_result$x[, 1], PC2 = pca_result$x[, 2], cluster = factor(km_result$cluster))
# Calculate and plot convex hulls
# chull() gives, per cluster, the indices of the convex-hull vertices in
# order, so geom_polygon can shade each cluster's outline.
pc_data_hull <- pc_data %>%
group_by(cluster) %>%
slice(chull(PC1, PC2))
ggplot(pc_data, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(aes(shape = cluster), size = 3) +
geom_text(aes(label = rownames(pc_data)), hjust = 0, vjust = 1, size = 3) +
geom_polygon(data = pc_data_hull, aes(group = cluster, fill = cluster), alpha = 0.4) +
theme_minimal() +
theme(text = element_text(size = 20))
As shown in the PCA plot, the four clusters are well-separated, with clear groupings in the two-dimensional principal component space.
Using K-means clustering, I grouped the countries into four different clusters, each representing similar health risk profiles across the 28 health factors. The centroids of each cluster reveal the central characteristics of each group, helping us understand the typical health risks associated with each cluster.
km_result$centers # View the cluster centroids
## Outdoor.air.pollution High.systolic.blood.pressure Diet.high.in.sodium
## 1 2.0857998 2.3265096 1.9576864
## 2 -0.1477397 -0.2098379 -0.2161815
## 3 -0.7461092 0.3067698 0.2282145
## 4 -0.6758814 -0.6555480 -0.3953171
## Diet.low.in.whole.grains Alochol.use Diet.low.in.fruits Unsafe.water.source
## 1 2.1129794 1.7165558 1.9179089 -0.5789006
## 2 -0.1574268 -0.2034705 -0.2281992 -0.3590037
## 3 0.3922783 -0.5226601 1.2241670 -0.3921265
## 4 -0.7322057 -0.2434754 -0.3911266 1.7862090
## Secondhand.smoke Low.birth.weight Child.wasting Unsafe.sex
## 1 1.6568965 -0.7627299 -0.5521450 -0.3308018
## 2 -0.1834950 -0.3269858 -0.3636906 -0.0775537
## 3 2.2530140 -0.2353751 -0.2658308 -0.2989023
## 4 -0.4720127 1.7656211 1.7792124 0.5287871
## Diet.low.in.nuts.and.seeds Household.air.pollution.from.solid.fuels
## 1 2.1774236 -0.3897734
## 2 -0.2080765 -0.3413105
## 3 -0.6410455 -0.2456819
## 4 -0.5051699 1.5893319
## Diet.low.in.Vegetables Low.physical.activity Smoking
## 1 1.0232403 1.30682762 1.9547532
## 2 -0.1509441 0.02110706 -0.1232077
## 3 2.5082067 0.20554785 0.5798729
## 4 -0.2205828 -0.91179346 -0.7794369
## High.fasting.plasma.glucose Air.pollution High.body.mass.index
## 1 1.45414835 0.6689784 2.0595836
## 2 -0.04656837 -0.4298416 -0.1175765
## 3 1.44046387 -0.6235538 1.2433504
## 4 -0.82377474 1.2989296 -0.9117315
## Unsafe.sanitation No.access.to.handwashing.facility Drug.use
## 1 -0.5632209 -0.5855916 0.77881607
## 2 -0.3609172 -0.3663812 -0.03011016
## 3 -0.4744780 -0.5421061 1.06773398
## 4 1.7894527 1.8292777 -0.44112772
## Low.bone.mineral.density Vitamin.A.deficiency Child.stunting
## 1 0.68723070 -0.4151433 -0.4229601
## 2 0.02419196 -0.3454339 -0.2977400
## 3 -1.33621305 0.9232558 6.6113584
## 4 -0.43267749 1.5420669 0.9759776
## Discontinued.breastfeeding Non.exclusive.breastfeeding Iron.deficiency
## 1 -0.3302274 -0.5386168 -0.2686841
## 2 -0.1855950 -0.3470254 -0.1509155
## 3 8.3349296 -0.5765474 8.6747011
## 4 0.3645691 1.7268991 0.1680694
## rgroup
## 1 0.0249419168
## 2 -0.0007325916
## 3 -0.0678930897
## 4 -0.0081163894
The centroid values indicate significant differences in health factors among the clusters. Some clusters may exhibit high health risk factors, such as air pollution and high sodium intake, while others may show better performance in these areas.
table(km_result$cluster) # View the number of data points in each cluster
##
## 1 2 3 4
## 553 3451 60 886
By examining the distribution of data points across clusters, I found that the clusters vary in size. Larger clusters represent more common health patterns, while smaller clusters may reflect unique health issues faced by specific countries.
To assess the quality of the clustering, I have used the silhouette method in 2.2 Number of Clusters to measure the cohesion within clusters and the separation between clusters, and found that k = 4 is optimal. The silhouette scores of the four clusters show good separation between the clusters, confirming that the choice of four clusters and the K-means algorithm are reasonable and effective choices for this dataset.
In this project, I systematically analyzed the relationship between health risk factors and mortality rates across countries, covering data exploration, data cleaning and transformation, feature selection, classification modeling, LIME explanation, and clustering analysis.
During the data cleaning phase, I integrated the total population, mortality rate (per thousand people), and health risk factors for each country, ensuring consistency over the 1990–2019 period. Health factors were standardized as rates per thousand to facilitate fair comparisons of risks across countries.
In the feature selection phase, I used cross-validated mean AUC and deviance reduction as evaluation criteria to identify the most relevant health factors and constructed two feature combinations. For classification modeling, I applied decision tree and logistic regression models to predict high-risk and low-risk countries. The performance evaluation showed that Combination 2’s decision tree model outperformed the others, indicating strong predictive power in distinguishing between countries with high and low mortality risks.
To further understand the decision-making process of the model, I employed LIME (Local Interpretable Model-Agnostic Explanations) to analyze the prediction logic of the decision tree model for Combination 2. The analysis revealed that alcohol use and lack of handwashing facilities consistently played significant roles in predicting high mortality risk across all cases, highlighting the impact of personal behaviors and basic sanitation on health outcomes.
Finally, in the clustering analysis, I used the K-means algorithm to group countries based on 28 health-related risk factors. Using the Silhouette Method, I identified four optimal clusters. The clustering results demonstrated significant variations in health risk factors across countries.
This project’s comprehensive data analysis confirmed a strong link between health risk factors and mortality rates. Additionally, the classification models and clustering analysis uncovered patterns and disparities in health risks across countries.
Hijmans, R. J. (2012). Cross-validation of species distribution models: removing spatial sorting bias and calibration with a null model. Ecology (Durham), 93(3), 679–688. https://doi.org/10.1890/11-0826.1
Sahoo, A., Bhattacharya, S., Jana, S., & Sujoy Baitalik. (2023). Neural network and decision tree-based machine learning tools to analyse the anion-responsive behaviours of emissive Ru(ii)–terpyridine complexes. Dalton Transactions, 52(1), 97–108. https://doi.org/10.1039/d2dt03289a
Redirecting. (2024). Proquest.com. https://ebookcentral.proquest.com/lib/uwa/reader.action?docID=4744194
Zhang, Y., Moges, S., & Block, P. (2016). Optimal Cluster Analysis for Objective Regionalization of Seasonal Precipitation in Regions of High Spatial–Temporal Variability: Application to Western Ethiopia. Journal of Climate, 29(10), 3697–3717. https://doi.org/10.1175/jcli-d-15-0582.1
Interpretable ensemble deep learning model for early detection of Alzheimer…: EBSCOhost. (2022). Ebscohost.com. https://onlinelibrary.wiley.com/doi/epdf/10.1002/ima.22762
Swetha Kariyavula, & Kalaivani Anbarasan. (2023). Improved accuracy of cluster formation for customer segmentation by identification of novel optimal number of clusters using silhouette method over K-means clustering. AIP Conference Proceedings. https://doi.org/10.1063/5.0119477